Merge commit for internal changes

Vijay Vasudevan 2016-06-07 10:10:03 -07:00
commit b7c416926e
104 changed files with 4860 additions and 965 deletions

View File

@ -1,6 +1,6 @@
package(default_visibility = ["//visibility:public"])
archive_dir = "eigen-eigen-d02e6a705c30"
archive_dir = "eigen-eigen-0c0b79ecd74c"
cc_library(
name = "eigen",

View File

@ -7,7 +7,7 @@
include (ExternalProject)
set(eigen_archive_hash "d02e6a705c30")
set(eigen_archive_hash "0c0b79ecd74c")
set(eigen_INCLUDE_DIRS
${CMAKE_CURRENT_BINARY_DIR}
@ -16,7 +16,7 @@ set(eigen_INCLUDE_DIRS
${tensorflow_source_dir}/third_party/eigen3
)
set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
set(eigen_HASH SHA256=532956172daa8aba87c750791ff89a5c38cdb07e2525afe17ecb4bef812d67cf)
set(eigen_HASH SHA256=b4b5884b03bd4bae114d02b36e2435ad1504ed8e51431d16c876b6f6a365882b)
set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)

View File

@ -198,6 +198,30 @@ py_test(
],
)
py_test(
name = "experiment_test",
size = "small",
srcs = ["python/learn/tests/experiment_test.py"],
srcs_version = "PY2AND3",
deps = [
":learn",
"//tensorflow:tensorflow_py",
"//tensorflow/python:framework_test_lib",
],
)
py_test(
name = "learn_runner_test",
size = "small",
srcs = ["python/learn/tests/learn_runner_test.py"],
srcs_version = "PY2AND3",
deps = [
":learn",
"//tensorflow:tensorflow_py",
"//tensorflow/python:framework_test_lib",
],
)
py_test(
name = "tensor_signature_test",
srcs = ["python/learn/estimators/tensor_signature_test.py"],
@ -439,6 +463,19 @@ py_test(
],
)
py_test(
name = "stability_test",
size = "small",
srcs = ["python/learn/tests/stability_test.py"],
srcs_version = "PY2AND3",
deps = [
":learn",
"//tensorflow:tensorflow_py",
"//tensorflow/python:framework",
"//tensorflow/python:framework_test_lib",
],
)
py_binary(
name = "inspect_checkpoint",
srcs = [

View File

@ -21,6 +21,7 @@ from __future__ import print_function
import numpy as np
# pylint: disable=wildcard-import
from tensorflow.contrib.learn.python.learn import datasets
from tensorflow.contrib.learn.python.learn import estimators
from tensorflow.contrib.learn.python.learn import graph_actions
@ -30,9 +31,9 @@ from tensorflow.contrib.learn.python.learn import monitors
from tensorflow.contrib.learn.python.learn import ops
from tensorflow.contrib.learn.python.learn import preprocessing
from tensorflow.contrib.learn.python.learn import utils
# pylint: disable=wildcard-import
from tensorflow.contrib.learn.python.learn.dataframe import *
from tensorflow.contrib.learn.python.learn.estimators import *
from tensorflow.contrib.learn.python.learn.experiment import Experiment
from tensorflow.contrib.learn.python.learn.graph_actions import evaluate
from tensorflow.contrib.learn.python.learn.graph_actions import infer
from tensorflow.contrib.learn.python.learn.graph_actions import NanLossDuringTrainingError

View File

@ -50,11 +50,11 @@ class DNNClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
def input_fn_eval: # returns x, Y
pass
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.
@ -145,11 +145,11 @@ class DNNRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
def input_fn_eval: # returns x, Y
pass
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.
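The docstring fix above switches the examples to explicit keyword arguments. A minimal sketch of the corrected call pattern, assuming an already-constructed `estimator` (e.g. a `DNNClassifier`) and placeholder input functions and data:

```python
# Placeholders: input_fn_train/input_fn_eval return ({name: Tensor}, Tensor);
# x is a matrix of shape [n_samples, n_features].
estimator.fit(input_fn=input_fn_train, steps=100)
scores = estimator.evaluate(input_fn=input_fn_eval, steps=1)
predictions = estimator.predict(x=x)  # keyword form, per the corrected docstring
```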

View File

@ -397,12 +397,12 @@ class DNNLinearCombinedClassifier(_DNNLinearCombinedBaseEstimator):
...
def input_fn_eval: # returns x, y
...
estimator.train(input_fn_train)
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.

View File

@ -42,7 +42,14 @@ def _prepare_iris_data_for_logistic_regression():
return iris
def _iris_input_fn():
def _iris_input_multiclass_fn():
iris = tf.contrib.learn.datasets.load_iris()
return {
'feature': tf.constant(iris.data, dtype=tf.float32)
}, tf.constant(iris.target, shape=[150, 1], dtype=tf.int32)
def _iris_input_logistic_fn():
iris = _prepare_iris_data_for_logistic_regression()
return {
'feature': tf.constant(iris.data, dtype=tf.float32)
@ -64,8 +71,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase):
dnn_feature_columns=cont_features,
dnn_hidden_units=[3, 3])
classifier.fit(input_fn=_iris_input_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100)
classifier.fit(input_fn=_iris_input_logistic_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_logistic_fn, steps=100)
self.assertGreater(scores['accuracy'], 0.9)
def testLogisticRegression_TensorData(self):
@ -127,8 +134,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase):
dnn_feature_columns=cont_features,
dnn_hidden_units=[3, 3])
classifier.fit(input_fn=_iris_input_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100)
classifier.fit(input_fn=_iris_input_multiclass_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_multiclass_fn, steps=100)
self.assertGreater(scores['accuracy'], 0.9)
def testWeightColumn(self):
@ -210,8 +217,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase):
dnn_hidden_units=[3, 3],
dnn_optimizer=tf.train.AdagradOptimizer(learning_rate=0.1))
classifier.fit(input_fn=_iris_input_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100)
classifier.fit(input_fn=_iris_input_logistic_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_logistic_fn, steps=100)
self.assertGreater(scores['accuracy'], 0.9)
def testCustomOptimizerByString(self):
@ -230,8 +237,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase):
dnn_hidden_units=[3, 3],
dnn_optimizer='Adagrad')
classifier.fit(input_fn=_iris_input_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100)
classifier.fit(input_fn=_iris_input_logistic_fn, steps=100)
scores = classifier.evaluate(input_fn=_iris_input_logistic_fn, steps=100)
self.assertGreater(scores['accuracy'], 0.9)
def testPredict(self):

View File

@ -111,8 +111,8 @@ class BaseEstimator(sklearn.BaseEstimator):
self._model_dir = model_dir
if self._model_dir is None:
self._model_dir = tempfile.mkdtemp()
logging.info('Using temporary folder as model directory: %s',
self._model_dir)
logging.warning('Using temporary folder as model directory: %s',
self._model_dir)
# Create a run configuration
if config is None:
@ -135,9 +135,8 @@ class BaseEstimator(sklearn.BaseEstimator):
self._graph = None
def fit(
self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
monitors=None):
def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
monitors=None):
"""Trains a model given training data `x` predictions and `y` targets.
Args:
@ -421,21 +420,20 @@ class BaseEstimator(sklearn.BaseEstimator):
monitors=None,
log_every_steps=100,
fail_on_nan_loss=True):
# TODO(wicke): This is a hack and needs to go.
if self._config.execution_mode not in ('all', 'train'):
return
# TODO(wicke): Remove this once Model and associated code are gone.
if hasattr(self._config, 'execution_mode'):
if self._config.execution_mode not in ('all', 'train'):
return
if not self._model_dir:
raise ValueError('Estimator\'s model_dir should be non-empty.')
# Stagger startup of worker sessions based on task id.
sleep_secs = min(self._config.training_worker_max_startup_secs,
self._config.task *
self._config.training_worker_session_startup_stagger_secs)
if sleep_secs:
logging.info('Waiting %d secs before starting task %d.', sleep_secs,
self._config.task)
time.sleep(sleep_secs)
# Stagger startup of worker sessions based on task id.
sleep_secs = min(
self._config.training_worker_max_startup_secs,
self._config.task *
self._config.training_worker_session_startup_stagger_secs)
if sleep_secs:
logging.info('Waiting %d secs before starting task %d.', sleep_secs,
self._config.task)
time.sleep(sleep_secs)
# Device allocation
device_fn = device_fn or self._device_fn
@ -454,7 +452,7 @@ class BaseEstimator(sklearn.BaseEstimator):
monitors += monitors_lib.get_default_monitors(
loss_op=loss_op,
summary_op=logging_ops.get_summary_op(),
save_summary_steps=100,
save_summary_steps=self._config.save_summary_steps,
summary_writer=graph_actions.get_summary_writer(self._model_dir))
is_chief = self._config.task == 0
@ -478,8 +476,9 @@ class BaseEstimator(sklearn.BaseEstimator):
log_every_steps=log_every_steps,
supervisor_is_chief=is_chief,
supervisor_master=self._config.master,
supervisor_save_model_secs=self._config.save_checkpoints_secs,
feed_fn=feed_fn,
max_steps=steps,
steps=steps,
fail_on_nan_loss=fail_on_nan_loss,
monitors=monitors)
@ -513,8 +512,9 @@ class BaseEstimator(sklearn.BaseEstimator):
feed_fn=None,
metrics=None,
name=''):
# TODO(wicke): This is a hack and needs to go.
if self._config.execution_mode not in ('all', 'evaluate', 'eval_evalset'):
# TODO(wicke): Remove this once Model and associated code are gone.
if (hasattr(self._config, 'execution_mode') and
self._config.execution_mode not in ('all', 'evaluate', 'eval_evalset')):
return
# Check that model has been trained.
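The staggered-startup block above computes each worker's sleep time as its task id times the stagger interval, capped by the maximum startup delay. Purely illustrative arithmetic, using the defaults this commit sets in `RunConfig` (5 s stagger, 60 s cap):

```python
stagger_secs = 5    # RunConfig.training_worker_session_startup_stagger_secs
max_startup = 60    # RunConfig.training_worker_max_startup_secs

for task in (0, 3, 20):
    sleep_secs = min(max_startup, task * stagger_secs)
    print(task, sleep_secs)   # -> 0 0, 3 15, 20 60
```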

View File

@ -46,10 +46,10 @@ class LinearClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
...
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.
@ -126,10 +126,10 @@ class LinearRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
...
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have following features,
otherwise there will be a KeyError:
if `weight_column_name` is not None:
key=weight_column_name, value=a `Tensor`

View File

@ -24,79 +24,54 @@ from tensorflow.python import GPUOptions
class RunConfig(object):
"""This class specifies the specific configurations for the run.
"""This class specifies the specific configurations for the run."""
Parameters:
execution_mode: Runners use this flag to execute different tasks, like
training vs evaluation. 'all' (the default) executes both training and
eval.
master: TensorFlow master. Empty string (the default) for local.
task: Task id of the replica running the training (default: 0).
num_ps_replicas: Number of parameter server tasks to use (default: 0).
training_worker_session_startup_stagger_secs: Seconds to sleep between the
startup of each worker task session (default: 5).
training_worker_max_startup_secs: Max seconds to wait before starting any
worker (default: 60).
eval_delay_secs: Number of seconds between the beginning of each eval run.
If one run takes more than this amount of time, the next run will start
immediately once that run completes (default 60).
eval_steps: Number of steps to run in each eval (default: 100).
num_cores: Number of cores to be used (default: 4).
verbose: Controls the verbosity, possible values:
0: the algorithm and debug information is muted.
1: trainer prints the progress.
2: log device placement is printed.
gpu_memory_fraction: Fraction of GPU memory used by the process on
each GPU uniformly on the same machine.
tf_random_seed: Random seed for TensorFlow initializers.
Setting this value allows consistency between reruns.
keep_checkpoint_max: The maximum number of recent checkpoint files to keep.
As new files are created, older files are deleted.
If None or 0, all checkpoint files are kept.
Defaults to 5 (that is, the 5 most recent checkpoint files are kept.)
keep_checkpoint_every_n_hours: Number of hours between each checkpoint
to be saved. The default value of 10,000 hours effectively disables
the feature.
Attributes:
tf_master: Tensorflow master.
tf_config: Tensorflow Session Config proto.
tf_random_seed: Tensorflow random seed.
keep_checkpoint_max: Maximum number of checkpoints to keep.
keep_checkpoint_every_n_hours: Number of hours between each checkpoint.
"""
# TODO(wicke): Cull unused options.
# TODO(wicke): Move options out once functionality is covered by monitors
def __init__(self,
execution_mode='all',
master='',
task=0,
num_ps_replicas=0,
training_worker_session_startup_stagger_secs=5,
training_worker_max_startup_secs=60,
eval_delay_secs=60,
eval_steps=100,
num_cores=4,
verbose=1,
log_device_placement=False,
gpu_memory_fraction=1,
tf_random_seed=42,
save_summary_steps=100,
save_checkpoints_secs=60,
keep_checkpoint_max=5,
keep_checkpoint_every_n_hours=10000):
self.execution_mode = execution_mode
"""Constructor.
Args:
master: TensorFlow master. Empty string (the default) for local.
task: Task id of the replica running the training (default: 0).
num_ps_replicas: Number of parameter server tasks to use (default: 0).
num_cores: Number of cores to be used (default: 4).
log_device_placement: Log the op placement to devices (default: False).
gpu_memory_fraction: Fraction of GPU memory used by the process on
each GPU uniformly on the same machine.
tf_random_seed: Random seed for TensorFlow initializers.
Setting this value allows consistency between reruns.
save_summary_steps: Save summaries every this many steps.
save_checkpoints_secs: Save checkpoints every this many seconds.
keep_checkpoint_max: The maximum number of recent checkpoint files to
keep. As new files are created, older files are deleted. If None or 0,
all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
checkpoint files are kept.)
keep_checkpoint_every_n_hours: Number of hours between each checkpoint
to be saved. The default value of 10,000 hours effectively disables
the feature.
"""
self.master = master
self.task = task
self.num_ps_replicas = num_ps_replicas
self.training_worker_session_startup_stagger_secs = (
training_worker_session_startup_stagger_secs)
self.training_worker_max_startup_secs = training_worker_max_startup_secs
self.eval_delay_secs = eval_delay_secs
self.eval_steps = eval_steps
gpu_options = GPUOptions(
per_process_gpu_memory_fraction=gpu_memory_fraction)
self.tf_config = ConfigProto(log_device_placement=(verbose > 1),
self.tf_config = ConfigProto(log_device_placement=log_device_placement,
inter_op_parallelism_threads=num_cores,
intra_op_parallelism_threads=num_cores,
gpu_options=gpu_options)
self.tf_random_seed = tf_random_seed
self.save_summary_steps = save_summary_steps
self.save_checkpoints_secs = save_checkpoints_secs
self.keep_checkpoint_max = keep_checkpoint_max
self.keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours
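A construction sketch for the reworked `RunConfig`, using only keyword arguments visible in the new constructor above (values are illustrative):

```python
config = RunConfig(
    master='',                   # empty string = run locally
    task=0,
    num_ps_replicas=0,
    num_cores=4,
    log_device_placement=False,  # replaces the old verbose > 1 behavior
    gpu_memory_fraction=0.5,
    tf_random_seed=42,
    save_summary_steps=200,      # now consumed by the default monitors
    save_checkpoints_secs=120,
    keep_checkpoint_max=5)
```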

View File

@ -0,0 +1,134 @@
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Experiment class collecting information needed for a single training run."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
from tensorflow.python.platform import tf_logging as logging
class Experiment(object):
"""Experiment is a class containing all information needed to train a model.
"""
def __init__(self, estimator,
train_input_fn,
eval_input_fn,
eval_metrics=None):
"""Constructor for Experiment.
Args:
estimator: `Estimator` object.
train_input_fn: function, returns features and targets for training.
eval_input_fn: function, returns features and targets for evaluation.
eval_metrics: `dict` of string, metric function. If `None`, default set
is used.
"""
super(Experiment, self).__init__()
self._estimator = estimator
self._train_input_fn = train_input_fn
self._eval_input_fn = eval_input_fn
self._eval_metrics = eval_metrics
def train(self, steps=None, monitors=None, delay_secs=0):
"""Fit the estimator using the training data.
Train the estimator for `steps` steps, after waiting for `delay_secs`
seconds. If `steps` is `None`, train forever.
Args:
steps: Perform this many steps of training. `None`, the default, means
train forever.
monitors: A list of monitors to pass to the `Estimator`'s `fit` function.
delay_secs: Start training after this many seconds.
Returns:
The trained estimator.
"""
if delay_secs:
logging.info("Waiting %d secs before starting training.", delay_secs)
time.sleep(delay_secs)
return self._estimator.fit(input_fn=self._train_input_fn,
steps=steps, monitors=monitors)
def evaluate(self, steps=None, delay_secs=0):
"""Evaluate on the evaluation data.
Runs evaluation on the evaluation data and returns the result. If `steps`
is given, only run for this many steps. Start the evaluation after
`delay_secs` seconds.
Args:
steps: Run this many steps of evaluation.
delay_secs: Start evaluating after waiting for this many seconds.
Returns:
The result of the `evaluate` call to the `Estimator`.
"""
if delay_secs:
logging.info("Waiting %d secs before starting eval.", delay_secs)
time.sleep(delay_secs)
return self._estimator.evaluate(input_fn=self._eval_input_fn,
steps=steps,
metrics=self._eval_metrics)
def _continuous_eval(self, input_fn, steps=1000, delay_secs=0,
throttle_delay_secs=60):
"""Run continuous eval on the eval data.
Run `steps` steps of evaluation on the evaluation data set. This function
starts evaluating after `delay_secs` seconds and then runs no more than one
evaluation per `throttle_delay_secs`. It never returns.
Args:
input_fn: The input to use for this eval.
steps: Number of steps per evaluation run.
delay_secs: Start evaluating after this many seconds.
throttle_delay_secs: Do not re-evaluate unless the last evaluation was
started at least this many seconds ago.
"""
if delay_secs:
logging.info("Waiting %f secs before starting eval.", delay_secs)
time.sleep(delay_secs)
while True:
start = time.time()
self._estimator.evaluate(input_fn=input_fn,
steps=steps,
metrics=self._eval_metrics)
duration = time.time() - start
if duration < throttle_delay_secs:
difference = throttle_delay_secs - duration
logging.info("Waiting %f secs before starting next eval run.",
difference)
time.sleep(difference)
def continuous_eval(self, steps=1000, delay_secs=0, throttle_delay_secs=60):
self._continuous_eval(self._eval_input_fn, steps=steps,
delay_secs=delay_secs,
throttle_delay_secs=throttle_delay_secs)
def continuous_eval_on_train_data(self, steps=1000, delay_secs=0,
throttle_delay_secs=60):
self._continuous_eval(self._train_input_fn, steps=steps,
delay_secs=delay_secs,
throttle_delay_secs=throttle_delay_secs)
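A usage sketch for the new `Experiment` class; `my_estimator`, `train_input_fn`, and `eval_input_fn` are assumed to exist already:

```python
experiment = Experiment(
    estimator=my_estimator,
    train_input_fn=train_input_fn,
    eval_input_fn=eval_input_fn,
    eval_metrics=None)               # None -> the estimator's default metrics

experiment.train(steps=1000, delay_secs=5)   # sleep 5 s, then fit
results = experiment.evaluate(steps=100)     # one evaluation pass
# experiment.continuous_eval(steps=100)      # never returns; >= 60 s per cycle
```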

View File

@ -30,8 +30,9 @@ from six import reraise
from tensorflow.contrib.framework.python.ops import ops as contrib_ops
from tensorflow.contrib.framework.python.ops import variables as contrib_variables
from tensorflow.contrib.layers.python.layers import summaries
from tensorflow.contrib.learn.python.learn import monitors as monitors_lib
from tensorflow.contrib.learn.python.learn.utils import checkpoints
from tensorflow.core.framework import summary_pb2
from tensorflow.python.client import session as tf_session
from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
@ -131,7 +132,7 @@ def train(graph,
supervisor_save_model_secs=600,
supervisor_save_summaries_steps=100,
feed_fn=None,
max_steps=None,
steps=None,
fail_on_nan_loss=True,
monitors=None):
"""Train a model.
@ -173,7 +174,7 @@ def train(graph,
`supervisor_save_summaries_steps` seconds when training.
feed_fn: A function that is called every iteration to produce a `feed_dict`
passed to `session.run` calls. Optional.
max_steps: Train until `global_step_tensor` evaluates to this value.
steps: Trains for this many steps (e.g. current global step + `steps`).
fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
evaluates to `NaN`. If false, continue training as if nothing happened.
monitors: List of `BaseMonitor` subclass instances. Used for callbacks
@ -192,28 +193,36 @@ def train(graph,
if not output_dir:
raise ValueError('Output directory should be non-empty.')
global_step_tensor = contrib_variables.assert_or_get_global_step(
graph, global_step_tensor)
if global_step_tensor is None:
raise ValueError('No "global_step" was provided or found in the graph.')
with graph.as_default():
global_step_tensor = contrib_variables.assert_or_get_global_step(
graph, global_step_tensor)
if global_step_tensor is None:
raise ValueError('No "global_step" was provided or found in the graph.')
summary_writer = (get_summary_writer(output_dir)
if supervisor_is_chief else None)
# Get current step.
try:
start_step = checkpoints.load_variable(
output_dir, global_step_tensor.name)
except (errors.NotFoundError, ValueError):
start_step = 0
# TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors.
if not supervisor_is_chief:
# monitors should run only on the chief.
monitors = []
elif not monitors:
monitors = monitors_lib.get_default_monitors(
loss_op=loss_op,
summary_op=logging_ops.get_summary_op(),
save_summary_steps=supervisor_save_summaries_steps,
summary_writer=summary_writer)
summary_writer = (get_summary_writer(output_dir)
if supervisor_is_chief else None)
# Start monitors, can create graph parts.
for monitor in monitors:
monitor.begin(max_steps=max_steps)
# TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors.
if not supervisor_is_chief:
# monitors should run only on the chief.
monitors = []
elif not monitors:
monitors = monitors_lib.get_default_monitors(
loss_op=loss_op,
summary_op=logging_ops.get_summary_op(),
save_summary_steps=supervisor_save_summaries_steps,
summary_writer=summary_writer)
# Start monitors, can create graph parts.
for monitor in monitors:
monitor.begin(max_steps=start_step + steps)
supervisor = tf_supervisor.Supervisor(
graph,
@ -235,6 +244,7 @@ def train(graph,
get_current_step = lambda: session.run(global_step_tensor)
start_step = get_current_step()
max_steps = start_step + steps
last_step = start_step
last_log_step = start_step
loss_value = None
@ -375,6 +385,28 @@ def _start_queue_runners(session, coord):
return threads
def _eval_results_to_str(eval_results):
return ', '.join('%s = %s' % (k, v) for k, v in eval_results.items())
def _write_summary_results(output_dir, eval_results, current_global_step):
"""Writes eval results into summary file in given dir."""
logging.info('Saving evaluation summary for %d step: %s' % (
current_global_step, _eval_results_to_str(eval_results)))
summary_writer = get_summary_writer(output_dir)
summary = summary_pb2.Summary()
for key in eval_results:
if eval_results[key] is None:
continue
value = summary.value.add()
value.tag = key
if (isinstance(eval_results[key], np.float32) or
isinstance(eval_results[key], float)):
value.simple_value = float(eval_results[key])
summary_writer.add_summary(summary, current_global_step)
summary_writer.close()
# TODO(ptucker): Add unit test.
def evaluate(graph,
output_dir,
@ -424,32 +456,26 @@ def evaluate(graph,
eval steps were run.
global_step: The global step this evaluation corresponds to.
"""
global_step_tensor = contrib_variables.assert_or_get_global_step(
graph, global_step_tensor)
with graph.as_default():
global_step_tensor = contrib_variables.assert_or_get_global_step(
graph, global_step_tensor)
for key, value in eval_dict.items():
if not summaries.is_summary_tag_unique(key):
continue
if isinstance(value, ops.Tensor):
summaries.summarize_tensor(value, tag=key)
# Create or get summary op, global_step and saver.
saver = _get_saver()
local_init_op = _get_local_init_op()
ready_op = _get_ready_op()
# Create or get summary op, global_step and saver.
summary_op = logging_ops.get_summary_op()
saver = _get_saver()
local_init_op = _get_local_init_op()
ready_op = _get_ready_op()
session_manager = session_manager_lib.SessionManager(
local_init_op=local_init_op,
ready_op=ready_op)
session, initialized = session_manager.recover_session(
master=supervisor_master,
saver=saver,
checkpoint_dir=checkpoint_path)
session_manager = session_manager_lib.SessionManager(
local_init_op=local_init_op,
ready_op=ready_op)
session, initialized = session_manager.recover_session(
master=supervisor_master,
saver=saver,
checkpoint_dir=checkpoint_path)
# Start queue runners.
coord = coordinator.Coordinator()
threads = _start_queue_runners(session, coord)
# Start queue runners.
coord = coordinator.Coordinator()
threads = _start_queue_runners(session, coord)
with session:
if not initialized:
@ -488,8 +514,7 @@ def evaluate(graph,
duration = time.time() - start_time
logging.info('Results after %d steps (%.3f sec/batch): %s.',
step, float(duration),
', '.join('%s = %s' % (k, v)
for k, v in eval_results.items()))
_eval_results_to_str(eval_results))
finally:
if eval_results is None or step != eval_step:
eval_results = session.run(eval_dict, feed_dict=feed_dict)
@ -498,20 +523,6 @@ def evaluate(graph,
coord.request_stop()
coord.join(threads, stop_grace_period_secs=120)
# Make our own summary writer and write a summary to the eval dir.
# Only is feed_fn is not provided.
# TODO(ipolosukhin): Convert evaluation to use streaming_metrics,
# then we can save for non feed_fn as well.
if summary_op is not None and feed_fn is None:
summary_writer = None
try:
summary_writer = get_summary_writer(output_dir)
summary_str = session.run(summary_op)
if summary_str:
summary_writer.add_summary(summary_str, current_global_step)
finally:
if summary_writer:
summary_writer.close()
# catch OutOfRangeError which is thrown when queue is out of data (and for
# other reasons as well).
except errors.OutOfRangeError as e:
@ -526,6 +537,9 @@ def evaluate(graph,
else:
logging.warn('Input iterator is exhausted: %s.', e)
# Save summaries for this evaluation.
_write_summary_results(output_dir, eval_results, current_global_step)
return eval_results, current_global_step
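The `max_steps` -> `steps` rename above turns the argument from an absolute stopping point into a relative one: training resumes from the checkpointed global step and advances `steps` further. Illustrative numbers only:

```python
# Suppose the checkpoint in output_dir holds global_step == 300.
start_step = 300                 # recovered via checkpoints.load_variable(...)
steps = 100                      # the new, relative argument
max_steps = start_step + steps   # the run now stops at global step 400
# Under the old contract, max_steps=100 would have stopped immediately.
```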

View File

@ -0,0 +1,75 @@
# pylint: disable=g-bad-file-header
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs an Experiment."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.contrib.learn.python.learn.experiment import Experiment
from tensorflow.python.platform import flags
from tensorflow.python.platform import tf_logging as logging
FLAGS = flags.FLAGS
flags.DEFINE_string('schedule', '', 'Schedule to run for this experiment. '
'A schedule identifies a method on the Experiment '
'instance returned by the function passed to the '
'run() call')
flags.DEFINE_string('output_dir', '', 'Base output directory. Made '
'available to the experiment builder function passed '
'to run(). All files written by the Experiment are '
'expected to be written into this directory.')
def run(experiment_fn):
"""Make and run an experiment."""
if not FLAGS.output_dir:
raise RuntimeError('Must specify an output directory (use --output_dir).')
if not FLAGS.schedule:
raise RuntimeError('Must specify a schedule (use --schedule).')
if not callable(experiment_fn):
raise TypeError('Experiment builder "%s" is not callable.' %
experiment_fn)
# Call the builder
experiment = experiment_fn(output_dir=FLAGS.output_dir)
if not isinstance(experiment, Experiment):
raise TypeError('Experiment builder did not return an Experiment '
'instance, got %s instead.' % type(experiment))
# Execute the schedule
taskname = FLAGS.schedule
if not hasattr(experiment, taskname):
logging.error('Schedule references non-existent task %s', taskname)
valid_tasks = [x for x in experiment.__dict__
if callable(getattr(experiment, x))]
logging.error('Allowed values for this experiment are: %s', valid_tasks)
raise ValueError('Schedule references non-existent task %s' % taskname)
task = getattr(experiment, taskname)
if not callable(task):
logging.error('Schedule references non-callable member %s', taskname)
valid_tasks = [x for x in experiment.__dict__
if callable(getattr(experiment, x))]
logging.error('Allowed values for this experiment are: %s', valid_tasks)
raise TypeError('Schedule references non-callable member %s' % taskname)
return task()
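A wiring sketch for `run`; the builder below is hypothetical, and `--schedule` must name a callable attribute of the returned `Experiment` (for example `train`, `evaluate`, or `continuous_eval`):

```python
def build_experiment(output_dir):
    # run() invokes this as experiment_fn(output_dir=FLAGS.output_dir).
    return Experiment(estimator=make_estimator(output_dir),  # placeholder
                      train_input_fn=train_input_fn,
                      eval_input_fn=eval_input_fn)

# e.g. python trainer.py --output_dir=/tmp/run --schedule=train
learn_runner.run(build_experiment)
```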

View File

@ -19,7 +19,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import six
from tensorflow.python.framework import ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import saver
from tensorflow.python.training import summary_io
@ -135,17 +140,27 @@ class PrintTensor(EveryN):
"""
def __init__(self, tensor_names, every_n=100, first_n=1):
"""Initializes PrintTensor monitor.
Args:
tensor_names: `dict` of tag to tensor names or
`iterable` of tensor names (strings).
every_n: Print every N steps.
first_n: Print first N steps.
"""
super(PrintTensor, self).__init__(every_n, first_n)
if not isinstance(tensor_names, dict):
tensor_names = {item: item for item in tensor_names}
self._tensor_names = tensor_names
def every_n_step_begin(self, unused_step):
return self._tensor_names
return list(self._tensor_names.values())
def every_n_step_end(self, step, outputs):
stats = []
for name in self._tensor_names:
if name in outputs:
stats.append("%s = %s" % (name, str(outputs[name])))
for tag, tensor_name in six.iteritems(self._tensor_names):
if tensor_name in outputs:
stats.append("%s = %s" % (tag, str(outputs[tensor_name])))
logging.info("Step %d: %s" % (step, ", ".join(stats)))
@ -179,14 +194,45 @@ class SummarySaver(EveryN):
class ValidationMonitor(EveryN):
"""Runs evaluation every n steps.
Can do early stopping on validation loss if `early_stopping_rounds` provided.
"""Runs evaluation of the Estimator every n steps.
Can do early stopping on validation metrics if
`early_stopping_rounds` is provided.
"""
def __init__(self, x=None, y=None, input_fn=None,
every_n_steps=100, early_stopping_rounds=None):
def __init__(self, x=None, y=None, input_fn=None, batch_size=None,
every_n_steps=100, metrics=None, early_stopping_rounds=None,
early_stopping_metric="loss",
early_stopping_metric_minimize=True, name=None):
"""Initializes ValidationMonitor.
Args:
x: matrix or tensor of shape [n_samples, n_features...]. Can be
iterator that returns arrays of features. The training input
samples for fitting the model. If set, `input_fn` must be `None`.
y: vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
iterator that returns array of targets. The training target values
(class labels in classification, real numbers in regression). If set,
`input_fn` must be `None`.
input_fn: Input function. If set, `x`, `y`, and `batch_size` must be
`None`.
batch_size: minibatch size to use on the input, defaults to first
dimension of `x`. Must be `None` if `input_fn` is provided.
every_n_steps: Runs this monitor every N steps.
metrics: Dict of metric ops to run. If None, the default metric functions
are used; if {}, no metrics are used.
early_stopping_rounds: If validation metric didn't go down for this many
steps, then stop training.
early_stopping_metric: `str`, name of the metric to early stop.
early_stopping_metric_minimize: `bool`, True if minimize, False
if maximize. For example, minimize `loss` or `mean_squared_error` and
maximize `accuracy` or `f1`.
name: `str`, appended to output sub-folder. If None uses `eval`
sub-folder, else `eval-%name%` is used to save summaries.
Raises:
ValueError: If both x and input_fn are provided.
"""
super(ValidationMonitor, self).__init__(every_n_steps=every_n_steps,
first_n_steps=-1)
if x is None and input_fn is None:
@ -194,25 +240,64 @@ class ValidationMonitor(EveryN):
self.x = x
self.y = y
self.input_fn = input_fn
self.min_loss_step = 0
self.min_loss = None
self.batch_size = batch_size
self.metrics = metrics
self.early_stopping_rounds = early_stopping_rounds
self.early_stopping_metric = early_stopping_metric
self.early_stopping_metric_minimize = early_stopping_metric_minimize
self.name = name
self._best_value_step = None
self._best_value = None
self._early_stopped = False
self._latest_path = None
self._latest_path_step = None
@property
def early_stopped(self):
return self._early_stopped
@property
def best_step(self):
return self._best_value_step
@property
def best_value(self):
return self._best_value
def every_n_step_end(self, step, unused_outputs):
# Check that we are not running evaluation on the same checkpoint.
latest_path = saver.latest_checkpoint(self._estimator.model_dir)
if latest_path == self._latest_path:
logging.info("Skipping evaluation due to same checkpoint %s for step %d "
"as for step %d.", latest_path, step, self._latest_path_step)
return False
self._latest_path = latest_path
self._latest_path_step = step
# Run evaluation and log it.
outputs = self._estimator.evaluate(
x=self.x, y=self.y, input_fn=self.input_fn)
x=self.x, y=self.y, input_fn=self.input_fn, batch_size=self.batch_size,
metrics=self.metrics, name=self.name)
stats = []
for name in outputs:
stats.append("%s = %s" % (name, str(outputs[name])))
logging.info("Validation (step %d): %s" % (step, ", ".join(stats)))
# Early stopping logic.
if self.early_stopping_rounds is not None:
if self.min_loss is None or outputs["loss"] < self.min_loss:
self.min_loss = outputs["loss"]
self.min_loss_step = step
stop_now = (step - self.min_loss_step >= self.early_stopping_rounds)
if (self._best_value is None or
(self.early_stopping_metric_minimize and
outputs[self.early_stopping_metric] < self._best_value) or
(not self.early_stopping_metric_minimize and
outputs[self.early_stopping_metric] > self._best_value)):
self._best_value = outputs[self.early_stopping_metric]
self._best_value_step = step
stop_now = (step - self._best_value_step >= self.early_stopping_rounds)
if stop_now:
logging.info("Stopping. Best step: {} with loss {}."
.format(self.min_loss_step, self.min_loss))
logging.info("Stopping. Best step: {} with {} = {}."
.format(self._best_value_step,
self.early_stopping_metric, self._best_value))
self._early_stopped = True
return True
return False
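A usage sketch matching the expanded signature above, mirroring the early-stopping test later in this commit; the data arrays and `estimator` are placeholders:

```python
val_monitor = ValidationMonitor(
    x=x_val, y=y_val,
    every_n_steps=50,
    early_stopping_rounds=100,
    early_stopping_metric='accuracy',
    early_stopping_metric_minimize=False)   # maximize accuracy

estimator.fit(x=x_train, y=y_train, steps=2000, monitors=[val_monitor])
if val_monitor.early_stopped:
    print('best step:', val_monitor.best_step,
          'best accuracy:', val_monitor.best_value)
```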
@ -220,7 +305,7 @@ class ValidationMonitor(EveryN):
class CaptureVariable(EveryN):
"""Capture a variable value into a `list`.
It's useful for unit testing.
This monitor is useful for unit testing.
"""
def __init__(self, var_name, every_n=100, first_n=1):
@ -239,9 +324,85 @@ def get_default_monitors(loss_op=None, summary_op=None, save_summary_steps=100,
output_dir=None, summary_writer=None):
monitors = []
if loss_op is not None:
monitors.append(PrintTensor([loss_op.name]))
monitors.append(PrintTensor(tensor_names={"loss": loss_op.name}))
if summary_op is not None:
monitors.append(SummarySaver(summary_op, save_steps=save_summary_steps,
output_dir=output_dir,
summary_writer=summary_writer))
return monitors
class GraphDump(BaseMonitor):
"""Dumps almost all tensors in the graph at every step.
Note, this is very expensive, prefer `PrintTensor` or `CaptureVariable` if
you are not debugging.
"""
IGNORE_OPS = ["Const", "Assign", "Identity", "Placeholder",
"RandomUniform", "Cast", "RestoreSlice"]
def __init__(self, ignore_ops=None):
"""Initializes GraphDump monitor.
Args:
ignore_ops: `list` of string names of `Operation`s to ignore.
If `None` GraphDump.IGNORE_OPS list is used.
"""
self.ignore_ops = ignore_ops or GraphDump.IGNORE_OPS
self._data = []
def begin(self, max_steps):
self.tensors = []
graph = ops.get_default_graph()
graph_def = graph.as_graph_def()
for node in graph_def.node:
if node.op in self.ignore_ops:
continue
try:
self.tensors.append(graph.get_tensor_by_name(node.name + ":0"))
except KeyError:
pass
def step_begin(self, step):
return self.tensors
def step_end(self, step, outputs):
self._data.append(outputs)
@property
def data(self):
return self._data
def compare(self, other_dump, step, atol=1e-06):
"""Compares two `GraphDump` monitors and returns differences.
Args:
other_dump: Another `GraphDump` monitor.
step: `int`, step to compare on.
atol: `float`, absolute tolerance in comparison of floating arrays.
Returns:
Returns tuple:
matched: `list` of keys that matched.
non_matched: `dict` of keys to difference.
"""
non_matched = {}
matched = []
for key in self.data[step]:
if not isinstance(key, str) and not isinstance(key, unicode):
continue
value1, value2 = self.data[step][key], other_dump.data[step][key]
if isinstance(value1, str):
continue
if isinstance(value1, np.ndarray):
if not np.allclose(value1, value2, atol=atol):
non_matched[key] = value1 - value2
else:
matched.append(key)
else:
if value1 != value2:
non_matched[key] = (value1, value2)
else:
matched.append(key)
return matched, non_matched
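A debugging sketch for the new `GraphDump` monitor, following the (commented-out) pattern in the stability test added by this commit; the two regressors and the data are placeholders:

```python
graph_dump1, graph_dump2 = GraphDump(), GraphDump()
regressor1.fit(x=x, y=y, steps=1, monitors=[graph_dump1], batch_size=1)
regressor2.fit(x=x, y=y, steps=1, monitors=[graph_dump2], batch_size=1)

matched, non_matched = graph_dump1.compare(graph_dump2, 0, atol=1e-02)
print(sorted(non_matched))   # tensor names whose recorded values diverged
```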

View File

@ -28,6 +28,12 @@ from tensorflow.contrib.learn.python.learn.estimators._sklearn import accuracy_s
from tensorflow.contrib.learn.python.learn.estimators._sklearn import train_test_split
def _get_summary_events(folder):
if not tf.gfile.Exists(folder):
raise ValueError('Folder %s doesn\'t exist.' % folder)
return tf.contrib.testing.latest_summaries(folder)
class EarlyStoppingTest(tf.test.TestCase):
"""Early stopping tests."""
@ -35,36 +41,54 @@ class EarlyStoppingTest(tf.test.TestCase):
random.seed(42)
iris = datasets.load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data,
iris.target,
test_size=0.2,
random_state=42)
x_train, x_test, y_train, y_test = train_test_split(
iris.data, iris.target, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train,
y_train,
test_size=0.2)
val_monitor = learn.monitors.ValidationMonitor(x_val,
y_val,
early_stopping_rounds=100)
x_train, x_val, y_train, y_val = train_test_split(
x_train, y_train, test_size=0.2, random_state=42)
val_monitor = learn.monitors.ValidationMonitor(
x_val, y_val, every_n_steps=50, early_stopping_rounds=100,
early_stopping_metric='accuracy', early_stopping_metric_minimize=False)
# classifier without early stopping - overfitting
classifier1 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
n_classes=3,
steps=1000)
classifier1 = learn.TensorFlowDNNClassifier(
hidden_units=[10, 20, 10], n_classes=3, steps=1000)
classifier1.fit(x_train, y_train)
accuracy_score(y_test, classifier1.predict(x_test))
_ = accuracy_score(y_test, classifier1.predict(x_test))
# Full 1000 steps, 11 summaries and no evaluation summary.
# 11 summaries = first + every 100 out of 1000 steps.
self.assertEqual(11, len(_get_summary_events(classifier1.model_dir)))
with self.assertRaises(ValueError):
_get_summary_events(classifier1.model_dir + '/eval')
# classifier with early stopping - improved accuracy on testing set
classifier2 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
n_classes=3,
steps=1000)
classifier2 = learn.TensorFlowDNNClassifier(
hidden_units=[10, 20, 10], n_classes=3, steps=2000,
config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
classifier2.fit(x_train, y_train, monitors=[val_monitor])
accuracy_score(y_test, classifier2.predict(x_test))
_ = accuracy_score(y_val, classifier2.predict(x_val))
_ = accuracy_score(y_test, classifier2.predict(x_test))
# Note, this test is unstable, so not checking for equality.
# See stability_test for examples of stability issues.
if val_monitor.early_stopped:
self.assertLess(val_monitor.best_step, 2000)
# Note: because the validation monitor stops after the best score occurs,
# the accuracy at the current checkpoint is lower.
# TODO(ipolosukhin): Time machine for restoring old checkpoints?
# Flaky: best_value is still not always better than the score2 value.
# self.assertGreater(val_monitor.best_value, score2_val)
# Early stopped; unstable, so only checking < max.
self.assertLess(len(_get_summary_events(classifier2.model_dir)), 21)
self.assertLess(len(_get_summary_events(
classifier2.model_dir + '/eval')), 4)
# TODO(ipolosukhin): Restore this?
# self.assertGreater(score2, score1, "No improvement using early stopping.")
if __name__ == "__main__":
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,119 @@
# pylint: disable=g-bad-file-header
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for TaskRunner and Experiment class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import tensorflow as tf
class TestEstimator(object):
def __init__(self):
self.eval_count = 0
self.fit_count = 0
def evaluate(self, **kwargs):
tf.logging.info('evaluate called with args: %s' % kwargs)
self.eval_count += 1
if self.eval_count > 5:
tf.logging.info('Ran 6 evals. Done.')
raise StopIteration()
return [(key, kwargs[key]) for key in sorted(kwargs.keys())]
def fit(self, **kwargs):
tf.logging.info('fit called with args: %s' % kwargs)
self.fit_count += 1
return [(key, kwargs[key]) for key in sorted(kwargs.keys())]
class ExperimentTest(tf.test.TestCase):
def test_train(self):
est = TestEstimator()
ex = tf.contrib.learn.Experiment(est,
train_input_fn='train_input',
eval_input_fn='eval_input',
eval_metrics='eval_metrics')
ex.train(delay_secs=0)
self.assertEquals(1, est.fit_count)
self.assertEquals(0, est.eval_count)
def test_train_delay(self):
est = TestEstimator()
ex = tf.contrib.learn.Experiment(est,
train_input_fn='train_input',
eval_input_fn='eval_input')
for delay in [0, 1, 3]:
start = time.time()
ex.train(delay_secs=delay)
duration = time.time() - start
tf.logging.info('train duration (expected %f): %f', delay, duration)
self.assertTrue(duration > delay - 0.5 and duration < delay + 0.5)
def test_evaluate(self):
est = TestEstimator()
ex = tf.contrib.learn.Experiment(est,
train_input_fn='train_input',
eval_input_fn='eval_input',
eval_metrics='eval_metrics')
ex.evaluate(steps='steps', delay_secs=0)
self.assertEquals(1, est.eval_count)
self.assertEquals(0, est.fit_count)
def test_evaluate_delay(self):
est = TestEstimator()
ex = tf.contrib.learn.Experiment(est,
train_input_fn='train_input',
eval_input_fn='eval_input')
for delay in [0, 1, 3]:
start = time.time()
ex.evaluate(delay_secs=delay)
duration = time.time() - start
tf.logging.info('eval duration (expected %f): %f', delay, duration)
self.assertTrue(duration > delay - 0.5 and duration < delay + 0.5)
def test_continuous_eval(self):
est = TestEstimator()
ex = tf.contrib.learn.Experiment(est,
train_input_fn='train_input',
eval_input_fn='eval_input',
eval_metrics='eval_metrics')
self.assertRaises(StopIteration, ex.continuous_eval,
delay_secs=0, throttle_delay_secs=0)
self.assertEquals(6, est.eval_count)
self.assertEquals(0, est.fit_count)
def test_continuous_eval_throttle_delay(self):
for delay in [0, 1, 2]:
est = TestEstimator()
ex = tf.contrib.learn.Experiment(est,
train_input_fn='train_input',
eval_input_fn='eval_input',
eval_metrics='eval_metrics')
start = time.time()
self.assertRaises(StopIteration, ex.continuous_eval,
delay_secs=0, throttle_delay_secs=delay)
duration = time.time() - start
expected = 5 * delay
tf.logging.info('eval duration (expected %f): %f', expected, duration)
self.assertTrue(duration > expected - 0.5 and duration < expected + 0.5)
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,107 @@
# pylint: disable=g-bad-file-header
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""learn_main tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.contrib.learn.python.learn import learn_runner
FLAGS = learn_runner.FLAGS
class TestExperiment(tf.contrib.learn.Experiment):
def __init__(self, default=None):
self.default = default
def simple_task(self):
return "simple_task, default=%s." % self.default
# pylint: disable=unused-argument
def build_experiment(output_dir):
tf.logging.info("In default build_experiment.")
return TestExperiment()
def build_non_experiment(output_dir):
return "Ceci n'est pas un Experiment."
# pylint: enable=unused-argument
class MainTest(tf.test.TestCase):
def setUp(self):
# Make sure the flags exist. It's unclear why this is necessary.
if not hasattr(FLAGS, "output_dir"):
learn_runner.flags.DEFINE_string("output_dir", "/tmp", "Fake")
if not hasattr(FLAGS, "schedule"):
learn_runner.flags.DEFINE_string("schedule", "simple_task", "Fake")
def test_run(self):
FLAGS.output_dir = "/tmp"
FLAGS.schedule = "simple_task"
self.assertEqual("simple_task, default=None.",
learn_runner.run(build_experiment))
def test_fail_no_output_dir(self):
FLAGS.output_dir = ""
FLAGS.schedule = "simple_test"
self.assertRaisesRegexp(RuntimeError,
"Must specify an output directory",
learn_runner.run, build_experiment)
def test_fail_no_schedule(self):
FLAGS.output_dir = "/tmp"
FLAGS.schedule = ""
self.assertRaisesRegexp(RuntimeError, "Must specify a schedule",
learn_runner.run, build_experiment)
def test_fail_non_callable(self):
FLAGS.output_dir = "/tmp"
FLAGS.schedule = "simple_test"
self.assertRaisesRegexp(TypeError,
"Experiment builder .* is not callable",
learn_runner.run, "not callable")
def test_fail_not_experiment(self):
FLAGS.output_dir = "/tmp"
FLAGS.schedule = "simple_test"
self.assertRaisesRegexp(
TypeError, "Experiment builder did not return an Experiment",
learn_runner.run, build_non_experiment)
def test_fail_non_existent_task(self):
FLAGS.output_dir = "/tmp"
FLAGS.schedule = "mirage"
self.assertRaisesRegexp(
ValueError, "Schedule references non-existent task",
learn_runner.run, build_experiment)
def test_fail_non_callable_task(self):
FLAGS.output_dir = "/tmp"
FLAGS.schedule = "default"
self.assertRaisesRegexp(
TypeError, "Schedule references non-callable member",
learn_runner.run, build_experiment)
if __name__ == "__main__":
tf.test.main()

View File

@ -0,0 +1,88 @@
# pylint: disable=g-bad-file-header
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Non-linear estimator tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# import random
import tensorflow as tf
class StabilityTest(tf.test.TestCase):
"""Tests that estiamtors are reproducible."""
def testRandomStability(self):
my_seed, minval, maxval = 42, -0.3333, 0.3333
with tf.Graph().as_default() as g:
with self.test_session(graph=g) as session:
tf.set_random_seed(my_seed)
x = tf.random_uniform([10, 10], minval=minval, maxval=maxval)
val1 = session.run(x)
with tf.Graph().as_default() as g:
with self.test_session(graph=g) as session:
tf.set_random_seed(my_seed)
x = tf.random_uniform([10, 10], minval=minval, maxval=maxval)
val2 = session.run(x)
self.assertAllClose(val1, val2)
def testLinearRegression(self):
# TODO(ipolosukhin): This doesn't pass at all, but should...
pass
# random.seed(42)
# boston = tf.contrib.learn.datasets.load_boston()
# regressor = tf.contrib.learn.LinearRegressor()
# regressor.fit(x=boston.data, y=boston.target, steps=1)
# regressor2 = tf.contrib.learn.LinearRegressor()
# regressor2.fit(x=boston.data, y=boston.target, steps=1)
# self.assertAllClose(regressor.weights_, regressor2.weights_)
# self.assertAllClose(regressor.bias_, regressor2.bias_)
# self.assertAllClose(regressor.predict(boston.data),
# regressor2.predict(boston.data), atol=1e-05)
def testDNNRegression(self):
# TODO(ipolosukhin): This doesn't pass at all, but should...
# Either bugs or just general instability.
pass
# random.seed(42)
# boston = tf.contrib.learn.datasets.load_boston()
# regressor = tf.contrib.learn.DNNRegressor(
# hidden_units=[10],
# optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.001))
# graph_dump = tf.contrib.learn.monitors.GraphDump()
# regressor.fit(x=boston.data, y=boston.target, steps=1,
# monitors=[graph_dump], batch_size=1)
# regressor2 = tf.contrib.learn.DNNRegressor(
# hidden_units=[10],
# optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.001))
# graph_dump2 = tf.contrib.learn.monitors.GraphDump()
# regressor2.fit(x=boston.data, y=boston.target, steps=1,
# monitors=[graph_dump2], batch_size=1)
# _, non_match = graph_dump.compare(graph_dump2, 0, atol=1e-02)
# self.assertEmpty(non_match.keys())
# for weight1, weight2 in zip(regressor.weights_, regressor2.weights_):
# self.assertAllClose(weight1, weight2)
# for bias1, bias2 in zip(regressor.bias_, regressor2.bias_):
# self.assertAllClose(bias1, bias2)
# self.assertAllClose(regressor.predict(boston.data),
# regressor2.predict(boston.data), atol=1e-05)
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,229 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <atomic>
#include <set>
#include <unordered_map>
#include <vector>
#include "tensorflow/core/graph/quantize_training.h"
#include "tensorflow/core/common_runtime/executor.h"
#include "tensorflow/core/common_runtime/function.h"
#include "tensorflow/core/common_runtime/memory_types.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/graph/subgraph.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/public/session_options.h"
namespace tensorflow {
namespace {
// Node types to rewrite. Insert quantize_and_dequantize op for their inputs.
const std::unordered_set<string, StringPiece::Hasher> nodes_to_rewrite{
"MatMul", "Conv2D"};
// Contains necessary parameters to convert an edge.
struct EdgeToConvert {
// Edge is not owned here.
const Edge* edge;
int32 num_bits;
bool signed_input;
bool range_given;
float input_min;
float input_max;
EdgeToConvert(const Edge* e, int32 bits, bool sign, bool range, float min,
float max) {
edge = e;
num_bits = bits;
signed_input = sign;
range_given = range;
input_min = min;
input_max = max;
}
};
// Decide if a node is in the backward pass by checking whether its name
// starts with "gradients".
// TODO(jmchen): Make this check more robust as it is not guaranteed that the
// forward node will not be named with a leading "gradients".
inline bool IsGradientNode(const Graph* graph, const Node* node) {
static const string tag = "gradients";
return (node->name().compare(0, tag.size(), tag) == 0);
}
// Find the type of the input to set the parameters for the
// quantize_and_dequantize op.
// Returns true if the root tensor op type is known, false otherwise.
bool FindType(const Graph* graph, const Node* node, bool* signed_input,
bool* range_given, float* input_min, float* input_max) {
const string src_op = node->type_string();
if (src_op == "Const" || src_op == "Variable") {
*signed_input = true;
*range_given = false;
} else if (src_op == "Relu") {
// Range is not given for Relu.
*signed_input = false;
*range_given = false;
} else if (src_op == "Relu6") {
*signed_input = false;
*range_given = true;
*input_min = 0;
*input_max = 6;
} else if (src_op == "Sigmoid") {
*signed_input = false;
*range_given = true;
*input_min = 0;
*input_max = 1;
} else if (src_op == "Tanh") {
*signed_input = true;
*range_given = true;
*input_min = -1;
*input_max = 1;
} else if (src_op == "Reshape") {
// Reshape has 2 inputs and the first one is the tensor.
for (const Edge* edge : node->in_edges()) {
if (edge->src_output() != Graph::kControlSlot && edge->dst_input() == 0) {
FindType(graph, edge->src(), signed_input, range_given, input_min,
input_max);
}
}
} else if (src_op == "Identity" || src_op == "MaxPool" ||
src_op == "AvgPool" || src_op == "MaxPool3D" ||
src_op == "AvgPool3D") {
// All these Ops only have 1 data input.
for (const Edge* edge : node->in_edges()) {
if (edge->src_output() != Graph::kControlSlot) {
FindType(graph, edge->src(), signed_input, range_given, input_min,
input_max);
}
}
} else {
// Unknown type, could be the model input examples.
// TODO: Set the params for input with user's hint.
*signed_input = true;
*range_given = false;
return false;
}
return true;
}
// Insert conversion op, connect it to the graph and remove the old edge.
Status ProcessTargetEdges(Graph* graph,
const std::vector<EdgeToConvert>& target_edges) {
// Remember previous convert ops to avoid duplicated conversion on the same
// input.
std::unordered_map<string, Node*, StringPiece::Hasher> name_index;
for (const EdgeToConvert edge : target_edges) {
Node* convert_node;
string name =
strings::StrCat(edge.edge->src()->name(), "/_QuantizeAndDequantize");
auto iter = name_index.find(name);
if (iter == name_index.end()) {
TF_RETURN_IF_ERROR(NodeBuilder(name, "_QuantizeAndDequantize")
.Input(edge.edge->src())
.Attr("signed_input", edge.signed_input)
.Attr("num_bits", edge.num_bits)
.Attr("range_given", edge.range_given)
.Attr("input_min", edge.input_min)
.Attr("input_max", edge.input_max)
.Finalize(graph, &convert_node));
name_index[name] = convert_node;
} else {
convert_node = iter->second;
}
graph->AddEdge(convert_node, 0, edge.edge->dst(), edge.edge->dst_input());
graph->RemoveEdge(edge.edge);
}
return Status::OK();
}
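// Note: keying name_index by "<src-name>/_QuantizeAndDequantize" is what
// lets two consumers of the same tensor share one convert node, as in the
// shared Q for m1 and m2 in the test below.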
} // namespace
Status DoQuantizeTraining(int32 num_bits, Graph* graph) {
if (graph == nullptr) {
return errors::InvalidArgument("Cannot accept a null graph pointer.");
}
if (num_bits < 1 || num_bits > 63) {
return errors::OutOfRange("num_bits should be in range [1, 63] but is: ",
num_bits);
}
int potential_input = 0;
std::vector<EdgeToConvert> target_edges;
for (Node* node : graph->nodes()) {
if (nodes_to_rewrite.find(node->type_string()) != nodes_to_rewrite.end() &&
!IsGradientNode(graph, node)) {
// Find out which types the inputs are and convert them accordingly.
// 1. Const/Variable OP: quantized as a signed tensor with no given range.
// 2. Activation OP: set the range according to the activation type.
// Currently we handle {Relu, Relu6, Sigmoid, Tanh}.
// 3. Identity OP: the quantization parameters depend on its input.
// 4. Pooling OPs: various pooling ops; these also depend on their input.
// 5. Reshape OP: also depends on the first input to this op.
// 6. Not-listed-above OP: if there is only one such op, treat it as the
// model input. However, if there is more than one unknown op, return an
// error for now to avoid unexpected behavior.
// Note: the list above may be incomplete. Please let us know if you hit
// the error so we can handle your case.
for (const Edge* edge : node->in_edges()) {
if (edge->src_output() == Graph::kControlSlot) {
// Skip the control dependency input.
continue;
} else {
bool signed_input = false;
bool range_given = false;
float input_min = 0;
float input_max = 0;
bool known_op = FindType(graph, edge->src(), &signed_input,
&range_given, &input_min, &input_max);
if (!known_op) {
// An unknown op is treated as a model input.
// Only one input is supported for now.
// TODO: Make this configurable if this remains the desired way to find
// inputs.
if (potential_input > 0) {
return errors::Unimplemented(
"Find a second unknown op: ", edge->src()->name(),
" with type: ", edge->src()->type_string(),
"; Unknown ops are considered as model input for now and "
"only 1 input is supported currently.");
}
potential_input++;
}
target_edges.emplace_back(edge, num_bits, signed_input, range_given,
input_min, input_max);
}
}
}
}
TF_RETURN_IF_ERROR(ProcessTargetEdges(graph, target_edges));
return Status::OK();
}
} // namespace tensorflow

View File

@ -0,0 +1,37 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_
#define TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_
#include "tensorflow/core/graph/graph.h"
namespace tensorflow {
// Rewrites the graph for quantized training.
// Rewrites the forward pass to include the precision loss of quantization so
// that the model can learn to deal with this loss and achieve better accuracy
// when it is later quantized for inference.
// Note that num_bits must be in [1, 63] and 'g' must not be null.
//
// On success, returns OK.
//
// On failure, returns the error status. Possible errors include:
// - num_bits out of range.
// - g is null.
// - More than one unknown op encountered.
Status DoQuantizeTraining(int32 num_bits, Graph* g);
} // namespace tensorflow
#endif // TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_
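A minimal caller sketch for the API above (illustrative only; it assumes `graph` was already constructed, e.g. via ConvertGraphDefToGraph):

// Hypothetical helper, not part of this commit.
Status RewriteForQuantizedTraining(Graph* graph) {
  const int32 num_bits = 8;  // must be in [1, 63]
  return DoQuantizeTraining(num_bits, graph);
}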

View File

@ -0,0 +1,161 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "tensorflow/core/graph/quantize_training.h"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/device_mgr.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_testutil.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/graph/testlib.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/public/session_options.h"
namespace tensorflow {
namespace {
class QuantizeTrainingTest : public ::testing::Test {
protected:
QuantizeTrainingTest() { Reset(); }
void Reset() { g_.reset(new Graph(OpRegistry::Global())); }
template <typename T>
Node* Constant(gtl::ArraySlice<T> values, TensorShape shape) {
return test::graph::Constant(g_.get(), test::AsTensor(values, shape));
}
std::unique_ptr<Graph> g_;
};
TEST_F(QuantizeTrainingTest, NormalGraph) {
// Construct the following graph
/*
m1 m2
/ \ / \
Relu Identity c
| |
a b
*/
Reset();
Graph* g = g_.get();
Node* a = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
Node* b = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
Node* c = Constant<float>({0.0, 1.0, 1.0, 0.0}, {2, 2});
g->AddControlEdge(g->source_node(), a);
g->AddControlEdge(g->source_node(), b);
g->AddControlEdge(g->source_node(), c);
Node* relu = test::graph::Relu(g, a);
Node* identity = test::graph::Identity(g, b);
Node* m1 = test::graph::Matmul(g, relu, identity, false, false);
Node* m2 = test::graph::Matmul(g, identity, c, false, false);
g->AddControlEdge(m1, g->sink_node());
g->AddControlEdge(m2, g->sink_node());
// The graph after the rewriting should be:
// "Q" is the quantize_and_dequantize op.
// Note the Q in the middle is shared by both m1 and m2.
/*
m1 m2
/ \ / \
Q Q Q
| | |
Relu Identity c
| |
a b
*/
int num_bits = 8;
// 4 edges to modify
TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
// There should be 12 nodes in total including the source and sink nodes.
EXPECT_EQ(12, g->num_nodes());
// Nodes m1 and m2's inputs should be the quantize_and_dequantize op.
std::vector<Node*> target_nodes{m1, m2};
for (Node* n : target_nodes) {
for (Node* in : n->in_nodes()) {
EXPECT_EQ("_QuantizeAndDequantize", in->type_string());
}
}
// relu, identity, c should now connect to the quantize_and_dequantize nodes.
std::vector<Node*> target_inputs{relu, identity, c};
for (Node* n : target_inputs) {
for (Node* out : n->out_nodes()) {
EXPECT_EQ("_QuantizeAndDequantize", out->type_string());
}
}
// Quantize_and_dequantize node for identity should have signed_input==true.
NodeDef identity_Q = identity->out_nodes().begin()->def();
ASSERT_EQ("true",
SummarizeAttrValue(identity_Q.attr().find("signed_input")->second));
// Quantize_and_dequantize node for relu should have signed_input==false.
NodeDef relu_Q = relu->out_nodes().begin()->def();
ASSERT_EQ("false",
SummarizeAttrValue(relu_Q.attr().find("signed_input")->second));
}
TEST_F(QuantizeTrainingTest, WithBackwardNodes) {
// Construct the same graph plus another backward Matmul.
Reset();
Graph* g = g_.get();
Node* a = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
Node* b = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
Node* c = Constant<float>({0.0, 1.0, 1.0, 0.0}, {2, 2});
g->AddControlEdge(g->source_node(), a);
g->AddControlEdge(g->source_node(), b);
g->AddControlEdge(g->source_node(), c);
Node* relu = test::graph::Relu(g, a);
Node* identity = test::graph::Identity(g, b);
Node* m1 = test::graph::Matmul(g, relu, identity, false, false);
Node* m2 = test::graph::Matmul(g, identity, c, false, false);
g->AddControlEdge(m1, g->sink_node());
g->AddControlEdge(m2, g->sink_node());
// Add a Matmul node with name starting with "gradients".
Node* backward_m;
TF_ASSERT_OK(NodeBuilder(g->NewName("gradients/n"), "MatMul")
.Input(m1)
.Input(m2)
.Attr("transpose_a", true)
.Attr("transpose_b", false)
.Finalize(g, &backward_m));
g->AddControlEdge(backward_m, g->sink_node());
int num_bits = 8;
// Still 4 changes, since the inputs of the backward node will not be converted.
TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
// Nodes m1 and m2's inputs should now be the quantize_and_dequantize op.
EXPECT_EQ(13, g->num_nodes());
EXPECT_EQ(2, m2->num_inputs());
}
} // namespace
} // namespace tensorflow

View File

@ -384,6 +384,15 @@ Node* GetSessionTensor(Graph* g, Node* in) {
return ret;
}
Node* Relu(Graph* g, Node* in) {
Node* ret;
TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Relu")
.Input(in, 0)
.Attr("T", DT_FLOAT)
.Finalize(g, &ret));
return ret;
}
void ToGraphDef(Graph* g, GraphDef* gdef) { g->ToGraphDef(gdef); }
} // end namespace graph

View File

@ -169,6 +169,9 @@ Node* GetSessionTensor(Graph* g, Node* in);
// given in "tensors".
Node* Concat(Graph* g, Node* concat_dim, gtl::ArraySlice<Node*> tensors);
// Add a Relu node in "g".
Node* Relu(Graph* g, Node* in);
} // end namespace graph
} // end namespace test
} // end namespace tensorflow

View File

@ -23,6 +23,7 @@ limitations under the License.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@ -99,10 +100,12 @@ class AvgPoolingOp : public UnaryOp<T> {
TensorFormat data_format_;
};
REGISTER_KERNEL_BUILDER(Name("AvgPool")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
AvgPoolingOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
AvgPoolingOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
AvgPoolingOp<CPUDevice, Eigen::half>);
#if GOOGLE_CUDA
template <typename T>
@ -181,14 +184,17 @@ namespace functor {
const Eigen::PaddingType& padding); \
extern template struct SpatialAvgPooling<GPUDevice, T>;
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
#undef DECLARE_GPU_SPEC
} // namespace functor
REGISTER_KERNEL_BUILDER(Name("AvgPool")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T"),
AvgPoolingOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
AvgPoolingOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<float>("T"),
AvgPoolingOp<GPUDevice, float>);
#endif // GOOGLE_CUDA
// The operation to compute AvgPool gradients.
@ -300,7 +306,7 @@ class AvgPoolingGradOp : public OpKernel {
GetBroadcastSize(c, in_cols, window_cols, col_stride,
pad_cols, &cindex, &csize));
T divide_coeff = 1.0 / (rsize * csize);
T divide_coeff(1.0 / (rsize * csize));
int64 output_index =
(b * out_backprop_rows + r) * out_backprop_cols + c;
for (int64 r_dst = rindex; r_dst < rindex + rsize; ++r_dst) {
@ -337,16 +343,16 @@ class AvgPoolingGradOp : public OpKernel {
TensorFormat data_format_;
};
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
.HostMemory("orig_input_shape"),
AvgPoolingGradOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T")
.HostMemory("orig_input_shape"),
AvgPoolingGradOp<CPUDevice, double>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.HostMemory("orig_input_shape"), \
AvgPoolingGradOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
TF_CALL_half(REGISTER_CPU_KERNEL);
#if GOOGLE_CUDA
@ -416,6 +422,12 @@ REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
.HostMemory("orig_input_shape")
.Label("cudnn"),
AvgPoolingGradOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
.Device(DEVICE_GPU)
.TypeConstraint<Eigen::half>("T")
.HostMemory("orig_input_shape")
.Label("cudnn"),
AvgPoolingGradOp<GPUDevice, Eigen::half>);
// A custom GPU kernel based AvgPoolingGrad implementation. It includes the
// padding as the candidates for the pooling operation.
@ -532,6 +544,11 @@ REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
.TypeConstraint<float>("T")
.HostMemory("orig_input_shape"),
AvgPoolingGradOpCustomGPUKernel<float>);
REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
.Device(DEVICE_GPU)
.TypeConstraint<Eigen::half>("T")
.HostMemory("orig_input_shape"),
AvgPoolingGradOpCustomGPUKernel<Eigen::half>);
#endif // GOOGLE_CUDA
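The hunks above show the registration idiom this commit applies across many kernels: explicit per-type REGISTER_KERNEL_BUILDER calls are folded into a single macro that is instantiated through the TF_CALL_* helpers from register_types.h, so the platform's type filtering applies automatically. A minimal sketch of the idiom, using a hypothetical MyOp kernel (names are illustrative, not from this commit):

#define REGISTER_MY_OP_CPU(T)                                 \
  REGISTER_KERNEL_BUILDER(                                    \
      Name("MyOp").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      MyOpKernel<CPUDevice, T>);
// TF_CALL_float(m) expands to m(float) on platforms where float kernels
// are enabled, and to nothing otherwise (e.g. some mobile builds), so the
// registration list tracks the platform's type whitelist.
TF_CALL_float(REGISTER_MY_OP_CPU);
TF_CALL_half(REGISTER_MY_OP_CPU);
#undef REGISTER_MY_OP_CPU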

View File

@ -33,6 +33,7 @@ typedef Eigen::GpuDevice GPUDevice;
#define DEFINE_GPU_KERNELS(T) \
template struct functor::SpatialAvgPooling<GPUDevice, T>;
DEFINE_GPU_KERNELS(Eigen::half)
DEFINE_GPU_KERNELS(float)
#undef DEFINE_GPU_KERNELS
@ -57,7 +58,7 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
dtype gradient = 0;
dtype gradient(0);
const dtype* const top_diff_slice =
top_diff + n * pooled_height * pooled_width * channels + c;
for (int ph = phstart; ph < phend; ++ph) {
@ -104,6 +105,12 @@ template bool RunAvePoolBackwardNHWC(
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
float* const bottom_diff, const GPUDevice& d);
template bool RunAvePoolBackwardNHWC(
const Eigen::half* const top_diff, const int num, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
Eigen::half* const bottom_diff, const GPUDevice& d);
} // end namespace tensorflow

View File

@ -159,9 +159,9 @@ class BatchNormGradOp : public OpKernel {
.TypeConstraint<T>("T"), \
BatchNormOp<CPUDevice, T>);
REGISTER_KERNEL(Eigen::half);
REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
TF_CALL_half(REGISTER_KERNEL);
TF_CALL_float(REGISTER_KERNEL);
TF_CALL_double(REGISTER_KERNEL);
#undef REGISTER_KERNEL
#if GOOGLE_CUDA
@ -179,8 +179,8 @@ namespace functor {
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
DECLARE_GPU_SPECS(Eigen::half);
DECLARE_GPU_SPECS(float);
TF_CALL_half(DECLARE_GPU_SPECS);
TF_CALL_float(DECLARE_GPU_SPECS);
#undef DECLARE_GPU_SPEC
} // namespace functor
@ -191,8 +191,8 @@ DECLARE_GPU_SPECS(float);
.TypeConstraint<T>("T"), \
BatchNormOp<GPUDevice, T>);
REGISTER_GPU_KERNEL(Eigen::half);
REGISTER_GPU_KERNEL(float);
TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
#endif // GOOGLE_CUDA
@ -203,9 +203,9 @@ REGISTER_GPU_KERNEL(float);
.TypeConstraint<T>("T"), \
BatchNormGradOp<CPUDevice, T>);
REGISTER_KERNEL(Eigen::half);
REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
TF_CALL_half(REGISTER_KERNEL);
TF_CALL_float(REGISTER_KERNEL);
TF_CALL_double(REGISTER_KERNEL);
#undef REGISTER_KERNEL
#if GOOGLE_CUDA
@ -226,8 +226,8 @@ namespace functor {
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
DECLARE_GPU_SPECS(Eigen::half);
DECLARE_GPU_SPECS(float);
TF_CALL_half(DECLARE_GPU_SPECS);
TF_CALL_float(DECLARE_GPU_SPECS);
#undef DECLARE_GPU_SPEC
} // namespace functor
@ -238,8 +238,8 @@ DECLARE_GPU_SPECS(float);
.TypeConstraint<T>("T"), \
BatchNormGradOp<GPUDevice, T>);
REGISTER_GPU_KERNEL(Eigen::half);
REGISTER_GPU_KERNEL(float);
TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
#endif // GOOGLE_CUDA

View File

@ -20,6 +20,7 @@ limitations under the License.
#include <numeric>
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
@ -182,18 +183,14 @@ class CheckNumericsOp<GPUDevice, T> : public OpKernel {
} // namespace
REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
.Device(DEVICE_CPU)
.TypeConstraint<Eigen::half>("T"),
CheckNumericsOp<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
CheckNumericsOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
CheckNumericsOp<CPUDevice, double>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("CheckNumerics").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
CheckNumericsOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
.Device(DEVICE_GPU)

View File

@ -22,6 +22,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@ -622,35 +623,24 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropInputOp);
};
REGISTER_KERNEL_BUILDER(
Name("Conv2DBackpropInput").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Conv2DCustomBackpropInputOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
.Device(DEVICE_CPU)
.TypeConstraint<Eigen::half>("T"),
Conv2DCustomBackpropInputOp<CPUDevice, Eigen::half>);
#define REGISTER_CPU_KERNELS(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv2DBackpropInput").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv2DCustomBackpropInputOp<CPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") \
.Device(DEVICE_CPU) \
.Label("custom") \
.TypeConstraint<T>("T"), \
Conv2DCustomBackpropInputOp<CPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") \
.Device(DEVICE_CPU) \
.Label("eigen_tensor") \
.TypeConstraint<T>("T"), \
Conv2DFastBackpropInputOp<CPUDevice, T>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
.Device(DEVICE_CPU)
.Label("custom")
.TypeConstraint<float>("T"),
Conv2DCustomBackpropInputOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
.Device(DEVICE_CPU)
.Label("custom")
.TypeConstraint<Eigen::half>("T"),
Conv2DCustomBackpropInputOp<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
.Device(DEVICE_CPU)
.Label("eigen_tensor")
.TypeConstraint<float>("T"),
Conv2DFastBackpropInputOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
.Device(DEVICE_CPU)
.Label("eigen_tensor")
.TypeConstraint<Eigen::half>("T"),
Conv2DFastBackpropInputOp<CPUDevice, Eigen::half>);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
template <typename Device, class T>
class Conv2DFastBackpropFilterOp : public OpKernel {
@ -867,35 +857,24 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropFilterOp);
};
REGISTER_KERNEL_BUILDER(
Name("Conv2DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Conv2DCustomBackpropFilterOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
.Device(DEVICE_CPU)
.TypeConstraint<Eigen::half>("T"),
Conv2DCustomBackpropFilterOp<CPUDevice, Eigen::half>);
#define REGISTER_CPU_KERNELS(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv2DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv2DCustomBackpropFilterOp<CPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") \
.Device(DEVICE_CPU) \
.Label("custom") \
.TypeConstraint<T>("T"), \
Conv2DCustomBackpropFilterOp<CPUDevice, T>); \
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") \
.Device(DEVICE_CPU) \
.Label("eigen_tensor") \
.TypeConstraint<T>("T"), \
Conv2DFastBackpropFilterOp<CPUDevice, T>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
.Device(DEVICE_CPU)
.Label("custom")
.TypeConstraint<float>("T"),
Conv2DCustomBackpropFilterOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
.Device(DEVICE_CPU)
.Label("custom")
.TypeConstraint<Eigen::half>("T"),
Conv2DCustomBackpropFilterOp<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
.Device(DEVICE_CPU)
.Label("eigen_tensor")
.TypeConstraint<float>("T"),
Conv2DFastBackpropFilterOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
.Device(DEVICE_CPU)
.Label("eigen_tensor")
.TypeConstraint<Eigen::half>("T"),
Conv2DFastBackpropFilterOp<CPUDevice, Eigen::half>);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
// GPU definitions of both ops.
#if GOOGLE_CUDA

View File

@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@ -194,14 +195,13 @@ class Conv3DBackpropInputOp : public OpKernel {
Padding padding_;
};
REGISTER_KERNEL_BUILDER(
Name("Conv3DBackpropInput").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Conv3DBackpropInputOp<CPUDevice, float>);
#ifndef IS_MOBILE_PLATFORM
REGISTER_KERNEL_BUILDER(
Name("Conv3DBackpropInput").Device(DEVICE_CPU).TypeConstraint<double>("T"),
Conv3DBackpropInputOp<CPUDevice, double>);
#endif
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv3DBackpropInput").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv3DBackpropInputOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
// Backprop for filter.
template <typename Device, class T>
@ -303,14 +303,13 @@ class Conv3DBackpropFilterOp : public OpKernel {
Padding padding_;
};
REGISTER_KERNEL_BUILDER(
Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Conv3DBackpropFilterOp<CPUDevice, float>);
#ifndef IS_MOBILE_PLATFORM
REGISTER_KERNEL_BUILDER(
Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<double>("T"),
Conv3DBackpropFilterOp<CPUDevice, double>);
#endif
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv3DBackpropFilterOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
// GPU definitions of both ops.
#if GOOGLE_CUDA

View File

@ -23,6 +23,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@ -245,12 +246,13 @@ class Conv2DOp : public BinaryOp<T> {
TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp);
};
REGISTER_KERNEL_BUILDER(
Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Conv2DOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
Conv2DOp<CPUDevice, Eigen::half>);
#define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv2DOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU);
TF_CALL_float(REGISTER_CPU);
#if GOOGLE_CUDA
int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,

View File

@ -21,6 +21,7 @@ limitations under the License.
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
@ -120,15 +121,13 @@ class Conv3DOp : public BinaryOp<T> {
Padding padding_;
};
REGISTER_KERNEL_BUILDER(
Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<float>("T"),
Conv3DOp<CPUDevice, float>);
#ifndef IS_MOBILE_PLATFORM
REGISTER_KERNEL_BUILDER(
Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<double>("T"),
Conv3DOp<CPUDevice, double>);
#endif
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
Conv3DOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
#if GOOGLE_CUDA

View File

@ -566,16 +566,14 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
};
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
DepthwiseConv2dNativeBackpropInputOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("DepthwiseConv2dNativeBackpropInput")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
DepthwiseConv2dNativeBackpropInputOp<CPUDevice, double>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \
DepthwiseConv2dNativeBackpropInputOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput")
@ -951,17 +949,15 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
};
REGISTER_KERNEL_BUILDER(
Name("DepthwiseConv2dNativeBackpropFilter")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("DepthwiseConv2dNativeBackpropFilter")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, double>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("DepthwiseConv2dNativeBackpropFilter") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \
DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(

View File

@ -376,14 +376,13 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeOp);
};
REGISTER_KERNEL_BUILDER(
Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint<float>("T"),
DepthwiseConv2dNativeOp<CPUDevice, float>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
DepthwiseConv2dNativeOp<CPUDevice, T>);
REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
DepthwiseConv2dNativeOp<CPUDevice, double>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(

View File

@ -143,13 +143,11 @@ class DrawBoundingBoxesOp : public OpKernel {
}
};
REGISTER_KERNEL_BUILDER(
Name("DrawBoundingBoxes").Device(DEVICE_CPU).TypeConstraint<float>("T"),
DrawBoundingBoxesOp<float>);
REGISTER_KERNEL_BUILDER(Name("DrawBoundingBoxes")
.Device(DEVICE_CPU)
.TypeConstraint<Eigen::half>("T"),
DrawBoundingBoxesOp<Eigen::half>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("DrawBoundingBoxes").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
DrawBoundingBoxesOp<T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
} // namespace tensorflow

View File

@ -309,7 +309,7 @@ struct AvgPoolMeanReducer {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE AvgPoolMeanReducer() : scalarCount_(0) {
typedef typename packet_traits<T>::type Packet;
packetCount_ = pset1<Packet>(0.0);
packetCount_ = pset1<Packet>(T(0.0));
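// Wrapping the literal in T(...) keeps the call well-formed for every
// instantiation, including T = Eigen::half; presumably the motivation,
// given the half support added throughout this commit.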
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {

View File

@ -21,6 +21,7 @@ limitations under the License.
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/fill_functor.h"
#if GOOGLE_CUDA
@ -202,17 +203,19 @@ struct MatMulFunctor<CPUDevice, T> {
.Label("cublas"), \
MatMulOp<GPUDevice, T, true /* cublas */>)
REGISTER_CPU(float);
REGISTER_CPU(double);
REGISTER_CPU(int32);
REGISTER_CPU(Eigen::half);
REGISTER_CPU(complex64);
REGISTER_CPU(complex128);
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
TF_CALL_half(REGISTER_CPU);
TF_CALL_int32(REGISTER_CPU);
TF_CALL_complex64(REGISTER_CPU);
TF_CALL_complex128(REGISTER_CPU);
#if GOOGLE_CUDA
REGISTER_GPU(float);
REGISTER_GPU(double);
TF_CALL_float(REGISTER_GPU);
TF_CALL_double(REGISTER_GPU);
#if CUDA_VERSION >= 7050
REGISTER_GPU(Eigen::half);
TF_CALL_half(REGISTER_GPU);
#endif
#endif // GOOGLE_CUDA

View File

@ -160,7 +160,7 @@ static void SpatialMaxPoolWithArgMaxHelper(
const int in_end = limit * in_size;
EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1,
in_end - in_start);
in_shard.setConstant(0);
in_shard.setConstant(T(0));
// Backpropagate.
const int out_size = out_height * out_width * depth;
@ -187,8 +187,12 @@ static void SpatialMaxPoolWithArgMaxHelper(
params.tensor_in_batch, shard_cost, shard);
}
REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_CPU),
MaxPoolingOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
MaxPoolingOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
MaxPoolingOp<CPUDevice, Eigen::half>);
#if GOOGLE_CUDA
// Forward declarations for the functor specializations for GPU.
@ -212,6 +216,7 @@ DECLARE_GPU_SPEC(float);
// kernel_label_map.
REGISTER_KERNEL_BUILDER(Name("MaxPool")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T")
.Label("eigen_tensor"),
MaxPoolingOp<Eigen::GpuDevice, float>);
#endif // GOOGLE_CUDA
@ -297,11 +302,16 @@ class MaxPoolingGradOp : public OpKernel {
TensorFormat data_format_;
};
REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_CPU),
MaxPoolingGradOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"),
MaxPoolingGradOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
MaxPoolingGradOp<CPUDevice, Eigen::half>);
#ifdef GOOGLE_CUDA
template <typename T>
static void MaxPoolingBackwardCustomKernel(
OpKernelContext* context, const std::vector<int32>& size,
const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in,
@ -318,12 +328,12 @@ static void MaxPoolingBackwardCustomKernel(
}
MaxPoolBackwardNoMask(
tensor_in->flat<float>().data(), params.tensor_in_batch,
tensor_in->flat<T>().data(), params.tensor_in_batch,
params.tensor_in_rows, params.tensor_in_cols, params.depth,
params.out_height, params.out_width, params.window_rows,
params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
params.pad_cols, out_backprop.flat<float>().data(),
output->flat<float>().data(), context->eigen_device<Eigen::GpuDevice>());
params.pad_cols, out_backprop.flat<T>().data(),
output->flat<T>().data(), context->eigen_device<Eigen::GpuDevice>());
}
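// Note: T is not deducible from the arguments (Tensor is type-erased), so
// callers must supply it explicitly, e.g. MaxPoolingBackwardCustomKernel<T>.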
template <class T>
@ -378,8 +388,8 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
} else {
CHECK(data_format_ == FORMAT_NHWC)
<< "Non-Cudnn MaxPoolGrad only supports NHWC format";
MaxPoolingBackwardCustomKernel(context, ksize_, stride_, padding_,
&tensor_in, out_backprop, output_shape);
MaxPoolingBackwardCustomKernel<T>(context, ksize_, stride_, padding_,
&tensor_in, out_backprop, output_shape);
}
}
@ -391,8 +401,12 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
bool use_dnn_;
};
REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_GPU),
MaxPoolingGradOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint<float>("T"),
MaxPoolingGradOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
MaxPoolingGradOp<Eigen::GpuDevice, Eigen::half>);
#endif // GOOGLE_CUDA
@ -625,8 +639,12 @@ struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
}
};
REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_GPU),
MaxPoolingNoMaskOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<float>("T"),
MaxPoolingNoMaskOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
MaxPoolingNoMaskOp<Eigen::GpuDevice, Eigen::half>);
template <typename T>
struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
@ -649,8 +667,14 @@ struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<int64>("Targmax"),
.TypeConstraint<int64>("Targmax")
.TypeConstraint<float>("T"),
MaxPoolingWithArgmaxOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<int64>("Targmax")
.TypeConstraint<Eigen::half>("T"),
MaxPoolingWithArgmaxOp<Eigen::GpuDevice, Eigen::half>);
template <typename T>
struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
@ -675,10 +699,18 @@ struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
}
};
REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<int64>("Targmax"),
MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGradWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<float>("T")
.TypeConstraint<int64>("Targmax"),
MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>);
REGISTER_KERNEL_BUILDER(
Name("MaxPoolGradWithArgmax")
.Device(DEVICE_GPU)
.TypeConstraint<Eigen::half>("T")
.TypeConstraint<int64>("Targmax"),
MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, Eigen::half>);
#endif // GOOGLE_CUDA

View File

@ -110,7 +110,7 @@ __global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dtype maxval = -FLT_MAX;
dtype maxval = Eigen::NumTraits<dtype>::lowest();
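// NumTraits<dtype>::lowest() generalizes the old -FLT_MAX initializer so
// the same kernel works for Eigen::half as well as float.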
int maxidx = -1;
const dtype* bottom_data_n = bottom_data + n * height * width * channels;
for (int h = hstart; h < hend; ++h) {
@ -149,7 +149,7 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
dtype maxval = -FLT_MAX;
dtype maxval = Eigen::NumTraits<dtype>::lowest();
int maxidx = -1;
const dtype* bottom_data_n = bottom_data + n * height * width * channels;
for (int h = hstart; h < hend; ++h) {
@ -165,8 +165,8 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
// Atomically accumulate the bottom diff. The index could still be
// uninitialized if all the bottom_data are NaN.
if (maxidx != -1) {
atomicAdd(bottom_diff + n * height * width * channels + maxidx,
top_diff[index]);
CudaAtomicAdd(bottom_diff + n * height * width * channels + maxidx,
top_diff[index]);
}
}
}
@ -185,8 +185,8 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
// bottom_offset: the pre-computed per-image offset of the maxpool input.
// This is equal to H*W*C.
// bottom_diff: the gradient with respect to the input.
// This function relies on atomicAdd to avoid race conditions. Also, before the
// kernel is run, you will need to make sure that bottom_diff is filled with
// This function relies on CudaAtomicAdd to avoid race conditions. Also, before
// the kernel is run, you will need to make sure that bottom_diff is filled with
// zero first.
template <typename dtype>
__global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
@ -194,8 +194,8 @@ __global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
const int bottom_offset, dtype* bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int image_id = (index / top_offset);
atomicAdd(bottom_diff + image_id * bottom_offset + mask[index],
top_diff[index]);
CudaAtomicAdd(bottom_diff + image_id * bottom_offset + mask[index],
top_diff[index]);
}
}
@ -219,6 +219,23 @@ bool MaxPoolForwardWithOptionalArgmax(
return d.ok();
}
bool MaxPoolForwardWithOptionalArgmax(
const Eigen::half* bottom_data, const int batch, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
Eigen::half* top_data, int64* mask, const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
const int output_size = batch * channels * pooled_height * pooled_width;
MaxPoolForwardNHWC<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(
output_size, bottom_data, height, width, channels, pooled_height,
pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
top_data, mask);
return d.ok();
}
bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
const int height, const int width,
const int channels, const int pooled_height,
@ -243,6 +260,30 @@ bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
return d.ok();
}
bool MaxPoolBackwardNoMask(const Eigen::half* bottom_data, const int batch,
const int height, const int width,
const int channels, const int pooled_height,
const int pooled_width, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l,
const Eigen::half* top_diff, Eigen::half* bottom_diff,
const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
const int bottom_size = batch * channels * height * width;
const int top_size = batch * channels * pooled_height * pooled_width;
SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff);
MaxPoolBackwardNoMaskNHWC<<<(top_size + kThreadsPerBlock - 1) /
kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(
top_size, bottom_data, height, width, channels, pooled_height,
pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
top_diff, bottom_diff);
return d.ok();
}
bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
const float* top_diff, const int64* mask,
const int top_offset, const int bottom_offset,
@ -256,12 +297,27 @@ bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
return d.ok();
}
bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
const Eigen::half* top_diff, const int64* mask,
const int top_offset, const int bottom_offset,
Eigen::half* bottom_diff,
const Eigen::GpuDevice& d) {
const int kThreadsPerBlock = 1024;
SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream()>>>(
output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
return d.ok();
}
typedef Eigen::GpuDevice GPUDevice;
#define DEFINE_GPU_KERNELS(T) \
template struct functor::SpatialMaxPooling<GPUDevice, T>;
DEFINE_GPU_KERNELS(float)
DEFINE_GPU_KERNELS(Eigen::half)
#undef DEFINE_GPU_KERNELS

View File

@ -37,11 +37,24 @@ bool MaxPoolForwardWithOptionalArgmax(
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
float* top_data, int64* mask, const Eigen::GpuDevice& d);
bool MaxPoolForwardWithOptionalArgmax(
const Eigen::half* bottom_data, const int batch, const int height,
const int width, const int channels, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
Eigen::half* top_data, int64* mask, const Eigen::GpuDevice& d);
bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
const float* top_diff, const int64* mask,
const int top_offset, const int bottom_offset,
float* bottom_diff, const Eigen::GpuDevice& d);
bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
const Eigen::half* top_diff, const int64* mask,
const int top_offset, const int bottom_offset,
Eigen::half* bottom_diff,
const Eigen::GpuDevice& d);
bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
const int height, const int width,
const int channels, const int pooled_height,
@ -51,6 +64,15 @@ bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
const float* top_diff, float* bottom_diff,
const Eigen::GpuDevice& d);
bool MaxPoolBackwardNoMask(const Eigen::half* bottom_data, const int batch,
const int height, const int width,
const int channels, const int pooled_height,
const int pooled_width, const int kernel_h,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_t, const int pad_l,
const Eigen::half* top_diff, Eigen::half* bottom_diff,
const Eigen::GpuDevice& d);
} // namespace tensorflow
#endif // TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_

View File

@ -104,10 +104,8 @@ class PackOp : public OpKernel {
PackOp<CPUDevice, type>)
TF_CALL_ALL_TYPES(REGISTER_PACK);
REGISTER_PACK(quint8);
REGISTER_PACK(qint8);
REGISTER_PACK(qint32);
REGISTER_PACK(bfloat16);
TF_CALL_QUANTIZED_TYPES(REGISTER_PACK);
TF_CALL_bfloat16(REGISTER_PACK);
#undef REGISTER_PACK

View File

@ -124,6 +124,7 @@ namespace functor {
extern template struct TransformDepth<GPUDevice, T, Eigen::DenseIndex>;
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(Eigen::half);
#undef DECLARE_GPU_SPEC
} // namespace functor
@ -368,7 +369,9 @@ void DnnPoolingGradOp<T>::Compute(
}
}
template class DnnPoolingOp<Eigen::half>;
template class DnnPoolingOp<float>;
template class DnnPoolingGradOp<Eigen::half>;
template class DnnPoolingGradOp<float>;
#endif // GOOGLE_CUDA

View File

@ -311,7 +311,7 @@ void SpatialAvgPool(OpKernelContext* context, Tensor* output,
}
}
}
DCHECK_GT(out_count.minCoeff(), 0);
DCHECK_GT(out_count.minCoeff(), T(0));
out_mat.array().rowwise() /= out_count.transpose().array();
}

View File

@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
@ -107,15 +108,14 @@ struct QuantizeAndDequantizeOneScaleFunctor<CPUDevice, T> {
};
} // namespace functor
REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
QuantizeAndDequantizeOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
QuantizeAndDequantizeOp<CPUDevice, double>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \
QuantizeAndDequantizeOp<CPUDevice, T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
#undef REGISTER_CPU_KERNEL
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize")

View File

@ -451,11 +451,11 @@ class MultinomialOp : public OpKernel {
.TypeConstraint<IntType>("Tout"), \
RandomUniformIntOp<CPUDevice, IntType>);
REGISTER(Eigen::half);
REGISTER(float);
REGISTER(double);
REGISTER_INT(int32);
REGISTER_INT(int64);
TF_CALL_half(REGISTER);
TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
TF_CALL_int32(REGISTER_INT);
TF_CALL_int64(REGISTER_INT);
#undef REGISTER
#undef REGISTER_INT
@ -505,11 +505,11 @@ REGISTER_INT(int64);
.TypeConstraint<IntType>("Tout"), \
RandomUniformIntOp<GPUDevice, IntType>);
REGISTER(Eigen::half);
REGISTER(float);
REGISTER(double);
REGISTER_INT(int32);
REGISTER_INT(int64);
TF_CALL_half(REGISTER);
TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
TF_CALL_int32(REGISTER_INT);
TF_CALL_int64(REGISTER_INT);
#undef REGISTER
#undef REGISTER_INT

View File

@ -25,8 +25,8 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
// NOTE: We should have mean(complex64, int32), too, but that requires
// changing Eigen::internal::MeanReducer to cast int to complex<float>.
// We don't see an immediate need for mean(complex64, int32) anyway.
REGISTER_CPU_KERNELS(complex64);
REGISTER_CPU_KERNELS(complex128);
TF_CALL_complex64(REGISTER_CPU_KERNELS);
TF_CALL_complex128(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#if GOOGLE_CUDA

View File

@ -159,15 +159,12 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNEL);
#undef REGISTER_KERNEL
REGISTER_KERNEL_BUILDER(Name("ResizeBilinearGrad")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
ResizeBilinearOpGrad<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("ResizeBilinearGrad")
.Device(DEVICE_CPU)
.TypeConstraint<Eigen::half>("T"),
ResizeBilinearOpGrad<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
Name("ResizeBilinearGrad").Device(DEVICE_CPU).TypeConstraint<double>("T"),
ResizeBilinearOpGrad<CPUDevice, double>);
#define REGISTER_CPU_GRAD_KERNEL(T) \
REGISTER_KERNEL_BUILDER( \
Name("ResizeBilinearGrad").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
ResizeBilinearOpGrad<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_GRAD_KERNEL);
TF_CALL_float(REGISTER_CPU_GRAD_KERNEL);
TF_CALL_double(REGISTER_CPU_GRAD_KERNEL);
} // namespace tensorflow

View File

@ -97,13 +97,13 @@ class ReverseOp : public OpKernel {
.HostMemory("dims"), \
ReverseOp<CPUDevice, T>)
REGISTER_KERNEL(uint8);
REGISTER_KERNEL(int8);
REGISTER_KERNEL(int32);
REGISTER_KERNEL(bool);
REGISTER_KERNEL(Eigen::half);
REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
TF_CALL_uint8(REGISTER_KERNEL);
TF_CALL_int8(REGISTER_KERNEL);
TF_CALL_int32(REGISTER_KERNEL);
TF_CALL_bool(REGISTER_KERNEL);
TF_CALL_half(REGISTER_KERNEL);
TF_CALL_float(REGISTER_KERNEL);
TF_CALL_double(REGISTER_KERNEL);
#undef REGISTER_KERNEL
#if GOOGLE_CUDA
@ -129,13 +129,13 @@ namespace functor {
DECLARE_GPU_SPEC_DIM(T, 7) \
DECLARE_GPU_SPEC_DIM(T, 8)
DECLARE_GPU_SPEC(uint8);
DECLARE_GPU_SPEC(int8);
DECLARE_GPU_SPEC(int32);
DECLARE_GPU_SPEC(bool);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
DECLARE_GPU_SPEC(double);
TF_CALL_uint8(DECLARE_GPU_SPEC);
TF_CALL_int8(DECLARE_GPU_SPEC);
TF_CALL_int32(DECLARE_GPU_SPEC);
TF_CALL_bool(DECLARE_GPU_SPEC);
TF_CALL_half(DECLARE_GPU_SPEC);
TF_CALL_float(DECLARE_GPU_SPEC);
TF_CALL_double(DECLARE_GPU_SPEC);
#undef DECLARE_GPU_SPEC
#undef DECLARE_GPU_SPEC_DIM
} // namespace functor
@ -147,11 +147,11 @@ DECLARE_GPU_SPEC(double);
.TypeConstraint<T>("T") \
.HostMemory("dims"), \
ReverseOp<GPUDevice, T>)
REGISTER_GPU_KERNEL(uint8);
REGISTER_GPU_KERNEL(int8);
REGISTER_GPU_KERNEL(Eigen::half);
REGISTER_GPU_KERNEL(float);
REGISTER_GPU_KERNEL(double);
TF_CALL_uint8(REGISTER_GPU_KERNEL);
TF_CALL_int8(REGISTER_GPU_KERNEL);
TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
TF_CALL_double(REGISTER_GPU_KERNEL);
#undef REGISTER_GPU_KERNEL
#endif // GOOGLE_CUDA

View File

@ -118,21 +118,16 @@ class LinSpaceOp : public OpKernel {
}
};
REGISTER_KERNEL_BUILDER(Name("LinSpace")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T")
.HostMemory("start")
.HostMemory("stop")
.HostMemory("num")
.HostMemory("output"),
LinSpaceOp<float>);
REGISTER_KERNEL_BUILDER(Name("LinSpace")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T")
.HostMemory("start")
.HostMemory("stop")
.HostMemory("num")
.HostMemory("output"),
LinSpaceOp<double>);
#define REGISTER_CPU_KERNEL(T) \
REGISTER_KERNEL_BUILDER(Name("LinSpace") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.HostMemory("start") \
.HostMemory("stop") \
.HostMemory("num") \
.HostMemory("output"), \
LinSpaceOp<T>);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
} // namespace tensorflow

View File

@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/core/kernels/softmax_op.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
@ -40,27 +41,22 @@ struct SoftmaxFunctor<CPUDevice, T> {
};
} // namespace functor
REGISTER_KERNEL_BUILDER(
Name("Softmax").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
SoftmaxOp<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(Name("Softmax")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
SoftmaxOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Softmax")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
SoftmaxOp<CPUDevice, double>);
REGISTER_KERNEL_BUILDER(
Name("LogSoftmax").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
SoftmaxOp<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
Name("LogSoftmax").Device(DEVICE_CPU).TypeConstraint<float>("T"),
SoftmaxOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("LogSoftmax")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
SoftmaxOp<CPUDevice, double>);
#define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER( \
Name("Softmax").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
SoftmaxOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU);
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
#undef REGISTER_CPU
#define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER( \
Name("LogSoftmax").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
SoftmaxOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU);
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(

View File

@ -157,10 +157,15 @@ EIGEN_STRONG_INLINE Packet8f pinterleave4x64<Packet8f>(const Packet8f& from) {
return _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(from),
_MM_SHUFFLE(3, 1, 2, 0)));
#else
__int64_t tmp1 = _mm256_extract_epi64(_mm256_castps_si256(from), 1);
__int64_t tmp2 = _mm256_extract_epi64(_mm256_castps_si256(from), 2);
__m256i tmp3 = _mm256_insert_epi64(_mm256_castps_si256(from), tmp1, 2);
return _mm256_castsi256_ps(_mm256_insert_epi64(tmp3, tmp2, 1));
auto tmp1 = _mm256_extract_epi32(_mm256_castps_si256(from), 2);
auto tmp2 = _mm256_extract_epi32(_mm256_castps_si256(from), 3);
auto tmp3 = _mm256_extract_epi32(_mm256_castps_si256(from), 4);
auto tmp4 = _mm256_extract_epi32(_mm256_castps_si256(from), 5);
auto tmp5 = _mm256_insert_epi32(_mm256_castps_si256(from), tmp1, 4);
tmp5 = _mm256_insert_epi32(tmp5, tmp2, 5);
tmp5 = _mm256_insert_epi32(tmp5, tmp3, 2);
tmp5 = _mm256_insert_epi32(tmp5, tmp4, 3);
return _mm256_castsi256_ps(tmp5);
#endif
}
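// The fallback above swaps the two middle 64-bit lanes via 32-bit
// extract/insert, matching _MM_SHUFFLE(3, 1, 2, 0) in the AVX2 branch;
// presumably the 64-bit insert/extract intrinsics are avoided because they
// are unavailable on some targets (e.g. 32-bit builds).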
// Return a Packet with 4 floats loaded from 4 bfloat16 values

View File

@ -238,25 +238,25 @@ class SparseMatmulOpTest : public ::testing::Test {
TEST_F(SparseMatmulOpTest, BroadcastPacketTest) {
for (int i = 0; i < PacketSize; ++i) ref[i] = data1[0];
internal::pstore(data2, internal::pbroadcast_first<Packet>(
internal::pload<Packet>(data1)));
internal::pstoreu(data2, internal::pbroadcast_first<Packet>(
internal::ploadu<Packet>(data1)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
if (PacketSize > 1) {
for (int i = 0; i < PacketSize; ++i) ref[i] = data1[1];
internal::pstore(data2, internal::pbroadcast_second<Packet>(
internal::pload<Packet>(data1)));
internal::pstoreu(data2, internal::pbroadcast_second<Packet>(
internal::ploadu<Packet>(data1)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
if (PacketSize > 2) {
for (int i = 0; i < PacketSize; ++i) ref[i] = data1[2];
internal::pstore(data2, internal::pbroadcast_third<Packet>(
internal::pload<Packet>(data1)));
internal::pstoreu(data2, internal::pbroadcast_third<Packet>(
internal::ploadu<Packet>(data1)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
if (PacketSize > 3) {
for (int i = 0; i < PacketSize; ++i) ref[i] = data1[3];
internal::pstore(data2, internal::pbroadcast_fourth<Packet>(
internal::pload<Packet>(data1)));
internal::pstoreu(data2, internal::pbroadcast_fourth<Packet>(
internal::ploadu<Packet>(data1)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
}
}
@ -276,8 +276,8 @@ TEST_F(SparseMatmulOpTest, InterleavePacketTest) {
for (int i = 0; i < PacketSize; ++i) ref[i] = data1[i];
}
internal::pstore(
data2, internal::pinterleave4x64<Packet>(internal::pload<Packet>(data1)));
internal::pstoreu(data2, internal::pinterleave4x64<Packet>(
internal::ploadu<Packet>(data1)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
}
@ -294,8 +294,8 @@ TEST_F(SparseMatmulOpTest, Bfloat16ExpandTest) {
ref[i] = data3[i];
}
}
internal::pstore(data2, internal::pexpand_bf16_l<Packet>(
internal::pload<Packet>(data3_bfloat16)));
internal::pstoreu(data2, internal::pexpand_bf16_l<Packet>(
internal::ploadu<Packet>(data3_bfloat16)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
if (PacketSize == 8) { // AVX
@ -311,18 +311,18 @@ TEST_F(SparseMatmulOpTest, Bfloat16ExpandTest) {
}
}
internal::pstore(data2, internal::pexpand_bf16_u<Packet>(
internal::pload<Packet>(data3_bfloat16)));
internal::pstoreu(data2, internal::pexpand_bf16_u<Packet>(
internal::ploadu<Packet>(data3_bfloat16)));
ASSERT_TRUE(areApprox(ref, data2, PacketSize));
}
TEST_F(SparseMatmulOpTest, Bfloat16LoadTest) {
if (PacketSize >= 4) {
for (int i = 0; i < 4; ++i) ref[i] = data3[i];
internal::pstore(data2, internal::pload4bf16<Packet>(data3_bfloat16));
internal::pstoreu(data2, internal::pload4bf16<Packet>(data3_bfloat16));
ASSERT_TRUE(areApprox(ref, data2, 4));
internal::pstore(data2, internal::pload2bf16<Packet>(data3_bfloat16));
internal::pstoreu(data2, internal::pload2bf16<Packet>(data3_bfloat16));
ASSERT_TRUE(areApprox(ref, data2, 2));
}
}

View File

@ -26,6 +26,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/type_index.h"
#include "tensorflow/core/lib/core/errors.h"
@ -92,18 +93,22 @@ class TileOp : public OpKernel {
HANDLE_DIM(T, 4) \
HANDLE_DIM(T, 5)
HANDLE_TYPE(DT_BOOL);
HANDLE_TYPE(DT_FLOAT);
HANDLE_TYPE(DT_DOUBLE);
HANDLE_TYPE(DT_UINT8);
HANDLE_TYPE(DT_INT32);
HANDLE_TYPE(DT_INT16);
HANDLE_TYPE(DT_INT64);
HANDLE_TYPE(DT_HALF);
HANDLE_TYPE(DT_COMPLEX64);
HANDLE_TYPE(DT_COMPLEX128);
HANDLE_TYPE(DT_STRING); // when DEVICE=CPUDevice.
#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)
// Invoke macro using TF_CALL_* so type-filtering for platform applies.
TF_CALL_bool(HANDLE_TYPE_NAME);
TF_CALL_float(HANDLE_TYPE_NAME);
TF_CALL_double(HANDLE_TYPE_NAME);
TF_CALL_uint8(HANDLE_TYPE_NAME);
TF_CALL_int32(HANDLE_TYPE_NAME);
TF_CALL_int16(HANDLE_TYPE_NAME);
TF_CALL_int64(HANDLE_TYPE_NAME);
TF_CALL_half(HANDLE_TYPE_NAME);
TF_CALL_string(HANDLE_TYPE_NAME); // when DEVICE=CPUDevice.
TF_CALL_complex64(HANDLE_TYPE_NAME);
TF_CALL_complex128(HANDLE_TYPE_NAME);
#undef HANDLE_TYPE_NAME
#undef HANDLE_TYPE
#undef HANDLE_DIM
@ -165,17 +170,20 @@ inline void TileOp<Device>::HandleCase(
HANDLE_CASE(device, dtype, 4); \
HANDLE_CASE(device, dtype, 5);
HANDLE_CASE_DIM(CPUDevice, DT_BOOL);
HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE);
HANDLE_CASE_DIM(CPUDevice, DT_UINT8);
HANDLE_CASE_DIM(CPUDevice, DT_INT32);
HANDLE_CASE_DIM(CPUDevice, DT_INT16);
HANDLE_CASE_DIM(CPUDevice, DT_INT64);
HANDLE_CASE_DIM(CPUDevice, DT_HALF);
HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX64);
HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX128);
HANDLE_CASE_DIM(CPUDevice, DT_STRING);
#define HANDLE_TYPE_NAME_CPU(T) \
HANDLE_CASE_DIM(CPUDevice, DataTypeToEnum<T>::value);
TF_CALL_bool(HANDLE_TYPE_NAME_CPU);
TF_CALL_float(HANDLE_TYPE_NAME_CPU);
TF_CALL_double(HANDLE_TYPE_NAME_CPU);
TF_CALL_uint8(HANDLE_TYPE_NAME_CPU);
TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
TF_CALL_int16(HANDLE_TYPE_NAME_CPU);
TF_CALL_int64(HANDLE_TYPE_NAME_CPU);
TF_CALL_half(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
TF_CALL_string(HANDLE_TYPE_NAME_CPU);
#if GOOGLE_CUDA
HANDLE_CASE_DIM(GPUDevice, DT_FLOAT);
@ -186,6 +194,7 @@ HANDLE_CASE_DIM(GPUDevice, DT_INT64);
HANDLE_CASE_DIM(GPUDevice, DT_HALF);
#endif // GOOGLE_CUDA
#undef HANDLE_TYPE_NAME_CPU
#undef HANDLE_CASE_DIM
#undef HANDLE_CASE
@ -249,13 +258,16 @@ class TileGradientOp : public OpKernel {
HANDLE_DIM(T, 4) \
HANDLE_DIM(T, 5)
HANDLE_TYPE(DT_FLOAT);
HANDLE_TYPE(DT_DOUBLE);
HANDLE_TYPE(DT_INT32);
HANDLE_TYPE(DT_INT16);
HANDLE_TYPE(DT_INT64);
HANDLE_TYPE(DT_HALF);
#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)
TF_CALL_float(HANDLE_TYPE_NAME);
TF_CALL_double(HANDLE_TYPE_NAME);
TF_CALL_int32(HANDLE_TYPE_NAME);
TF_CALL_int16(HANDLE_TYPE_NAME);
TF_CALL_int64(HANDLE_TYPE_NAME);
TF_CALL_half(HANDLE_TYPE_NAME);
#undef HANDLE_TYPE_NAME
#undef HANDLE_TYPE
#undef HANDLE_DIM
@ -390,14 +402,17 @@ inline void TileGradientOp<Device>::HandleCase(
HANDLE_CASE(device, dtype, 4); \
HANDLE_CASE(device, dtype, 5);
HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE);
HANDLE_CASE_DIM(CPUDevice, DT_INT16);
HANDLE_CASE_DIM(CPUDevice, DT_INT32);
HANDLE_CASE_DIM(CPUDevice, DT_INT64);
HANDLE_CASE_DIM(CPUDevice, DT_HALF);
HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX64);
HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX128);
#define HANDLE_TYPE_NAME_CPU(T) \
HANDLE_CASE_DIM(CPUDevice, DataTypeToEnum<T>::value);
TF_CALL_float(HANDLE_TYPE_NAME_CPU);
TF_CALL_double(HANDLE_TYPE_NAME_CPU);
TF_CALL_int16(HANDLE_TYPE_NAME_CPU);
TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
TF_CALL_int64(HANDLE_TYPE_NAME_CPU);
TF_CALL_half(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
#if GOOGLE_CUDA
HANDLE_CASE_DIM(GPUDevice, DT_FLOAT);
@ -409,6 +424,7 @@ HANDLE_CASE_DIM(GPUDevice, DT_HALF);
#endif // GOOGLE_CUDA
#undef HANDLE_TYPE_NAME_CPU
#undef HANDLE_CASE_DIM
#undef HANDLE_CASE

View File

@ -16,17 +16,28 @@ limitations under the License.
#define EIGEN_USE_THREADS
#include "tensorflow/core/kernels/training_ops.h"
#include <algorithm>
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
namespace functor {
namespace {
template <class T>
inline T sgn(const T x) {
T zero(0);
T one(1);
return (x == zero ? zero : (x < zero ? -one : one));
}
}
namespace functor {
template <typename T>
struct ApplyGradientDescent<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@ -56,6 +67,34 @@ struct ApplyAdadelta<CPUDevice, T> {
}
};
template <typename T>
struct ApplyProximalGradientDescent<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
typename TTypes<T>::ConstScalar lr,
typename TTypes<T>::ConstScalar l1,
typename TTypes<T>::ConstScalar l2,
typename TTypes<T>::ConstFlat grad) {
// Note: this is the FOBOS update; for details, see:
// http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf
// TODO(xbing): merge the logic for ProximalGradientDescent and
// ProximalAdagrad.
auto prox_var = var;
// compute v = w - lr * grad.
prox_var.device(d) -= grad * lr();
if (l1() > 0) {
var.device(d) = prox_var.abs() - var.constant(lr() * l1());
// compute sign(v) * max(|v| - lr * l1, 0)
var.device(d) = prox_var.sign() * var.cwiseMax(T(0.0));
} else {
var.device(d) = prox_var;
}
if (l2() > 0) {
// compute v / (1.0 + l2 * lr)
var.device(d) = var / (var.constant(1.0) + var.constant(l2() * lr()));
}
}
};
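For reference, a minimal NumPy sketch of the FOBOS update this functor implements; an illustration under the same l1/l2 conventions as the code above, not TensorFlow API (the helper name is ours):

```python
import numpy as np

def proximal_gradient_descent(var, grad, lr, l1, l2):
    # Plain gradient step: v = w - lr * grad.
    prox_var = var - lr * grad
    if l1 > 0:
        # Soft-threshold: sign(v) * max(|v| - lr * l1, 0).
        var = np.sign(prox_var) * np.maximum(np.abs(prox_var) - lr * l1, 0.0)
    else:
        var = prox_var
    if l2 > 0:
        # Shrink toward zero: v / (1 + lr * l2).
        var = var / (1.0 + lr * l2)
    return var
```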
template <typename T>
struct ApplyAdagrad<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@ -67,6 +106,35 @@ struct ApplyAdagrad<CPUDevice, T> {
}
};
template <typename T>
struct ApplyProximalAdagrad<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
typename TTypes<T>::Flat accum,
typename TTypes<T>::ConstScalar lr,
typename TTypes<T>::ConstScalar l1,
typename TTypes<T>::ConstScalar l2,
typename TTypes<T>::ConstFlat grad) {
// FOBOS update per the paper above, with an Adagrad learning rate.
accum.device(d) += grad.square();
// Adagrad learning rate.
auto learning_rate = accum.constant(lr()) * accum.rsqrt();
auto prox_var = var;
// compute v = w - lr * grad.
prox_var.device(d) -= grad * learning_rate;
if (l1() > 0) {
var.device(d) = prox_var.abs() - learning_rate * prox_var.constant(l1());
// compute sign(v) * max(|v| - lr * l1, 0)
var.device(d) = prox_var.sign() * var.cwiseMax(T(0.0));
} else {
var.device(d) = prox_var;
}
if (l2() > 0) {
var.device(d) =
var / (var.constant(1.0) + var.constant(l2()) * learning_rate);
}
}
};
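ApplyProximalAdagrad performs the same proximal step, only with a per-coordinate Adagrad learning rate; a hedged NumPy sketch of the same math (again an illustration, not the kernel):

```python
import numpy as np

def proximal_adagrad(var, accum, grad, lr, l1, l2):
    accum = accum + grad * grad
    # Per-coordinate Adagrad rate: lr / sqrt(accum).
    rate = lr / np.sqrt(accum)
    prox_var = var - rate * grad
    if l1 > 0:
        var = np.sign(prox_var) * np.maximum(np.abs(prox_var) - rate * l1, 0.0)
    else:
        var = prox_var
    if l2 > 0:
        var = var / (1.0 + l2 * rate)
    return var, accum
```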
template <typename T>
struct ApplyFtrl<CPUDevice, T> {
void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@ -221,10 +289,11 @@ class ApplyGradientDescentOp : public OpKernel {
REGISTER_KERNEL_BUILDER( \
Name("ApplyGradientDescent").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyGradientDescentOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
@ -246,6 +315,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
template <typename Device, typename T>
@ -347,10 +417,11 @@ typedef Eigen::GpuDevice GPUDevice;
REGISTER_KERNEL_BUILDER( \
Name("ApplyAdadelta").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyAdadeltaOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
@ -374,6 +445,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
// Note, this op works on cpu only.
@ -483,7 +555,6 @@ class SparseApplyAdadeltaOp : public OpKernel {
accum_update_ =
accum_update_ * accum_update_.constant(rho_scalar) +
update.square() * update.constant(static_cast<T>(1) - rho_scalar);
auto v = var_flat.template chip<0>(index);
v -= update * update.constant(lr_scalar);
}
@ -505,14 +576,213 @@ class SparseApplyAdadeltaOp : public OpKernel {
.TypeConstraint<T>("T") \
.TypeConstraint<Tindices>("Tindices"), \
SparseApplyAdadeltaOp<T, Tindices>);
#define REGISTER_CPU_KERNELS(T) \
REGISTER_KERNELS(T, int32); \
REGISTER_KERNELS(T, int64);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
// Note, this op works on cpu only.
template <typename Device, typename T>
class ApplyProximalGradientDescentOp : public OpKernel {
public:
explicit ApplyProximalGradientDescentOp(OpKernelConstruction* ctx)
: OpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
}
void Compute(OpKernelContext* ctx) override {
auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0});
Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
OP_REQUIRES(
ctx, var.IsInitialized(),
errors::FailedPrecondition(
"Attempting to use uninitialized variables: ", def().input(0)));
const Tensor& alpha = ctx->input(1);
OP_REQUIRES(ctx, IsLegacyScalar(alpha.shape()),
errors::InvalidArgument("alpha is not a scalar: ",
alpha.shape().DebugString()));
const Tensor& l1 = ctx->input(2);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l1.shape()),
errors::InvalidArgument("l1 regularization strength is not a scalar: ",
l1.shape().DebugString()));
const Tensor& l2 = ctx->input(3);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l2.shape()),
errors::InvalidArgument("l2 regularization strength is not a scalar: ",
l2.shape().DebugString()));
const Tensor& delta = ctx->input(4);
OP_REQUIRES(
ctx, var.shape().IsSameSize(delta.shape()),
errors::InvalidArgument("var and delta do not have the same shape",
var.shape().DebugString(), " ",
delta.shape().DebugString()));
const Device& device = ctx->template eigen_device<Device>();
functor::ApplyProximalGradientDescent<Device, T>()(
device, var.flat<T>(), alpha.scalar<T>(), l1.scalar<T>(),
l2.scalar<T>(), delta.flat<T>());
ctx->forward_ref_input_to_ref_output(0, 0);
}
private:
bool use_exclusive_lock_;
};
#define REGISTER_KERNELS(D, T) \
REGISTER_KERNEL_BUILDER(Name("ApplyProximalGradientDescent") \
.Device(DEVICE_##D) \
.TypeConstraint<T>("T"), \
ApplyProximalGradientDescentOp<D##Device, T>);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
#undef REGISTER_KERNELS
// Note, this op works on cpu only.
template <typename T, typename Tindex>
class SparseApplyProximalGradientDescentOp : public OpKernel {
public:
explicit SparseApplyProximalGradientDescentOp(OpKernelConstruction* ctx)
: OpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
}
void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
errors::InvalidArgument("var must be at least 1 dimensional"));
const Tensor& lr = ctx->input(1);
OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()),
errors::InvalidArgument("lr is not a scalar: ",
lr.shape().DebugString()));
const Tensor& l1 = ctx->input(2);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l1.shape()),
errors::InvalidArgument("l1 regularization strength is not a scalar: ",
l1.shape().DebugString()));
const Tensor& l2 = ctx->input(3);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l2.shape()),
errors::InvalidArgument("l2 regularization strength is not a scalar: ",
l2.shape().DebugString()));
const Tensor& grad = ctx->input(4);
const Tensor& indices = ctx->input(5);
OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
errors::InvalidArgument("indices must be one-dimensional"));
int64 inner_dim = 1;
for (int d = 1; d < var.dims(); d++) {
OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
errors::InvalidArgument(strings::StrCat(
"var and grad must match in dimension ", d)));
inner_dim *= grad.dim_size(d);
}
const Tindex N = indices.dim_size(0);
OP_REQUIRES(
ctx, grad.dim_size(0) == N,
errors::InvalidArgument(
"grad must be the same size as indices in the first dimension."));
if (N > 0) {
if (inner_dim > 1) {
const Tindex first_dim_size = var.dim_size(0);
auto indices_vec = indices.vec<Tindex>();
auto var_flat = var.flat_outer_dims<T>();
auto grad_flat = grad.flat_outer_dims<T>();
T lr_scalar = lr.scalar<T>()();
T l1_scalar = l1.scalar<T>()();
T l2_scalar = l2.scalar<T>()();
// TODO(xbing): extract the common logic for the Fobos update.
for (Tindex i = 0; i < N; i++) {
const Tindex index = internal::SubtleMustCopy(indices_vec(i));
OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
errors::InvalidArgument(
strings::StrCat("Index ", index, " at offset ", i,
" in indices is out of range")));
auto g = grad_flat.template chip<0>(i);
auto v = var_flat.template chip<0>(index);
// compute learning_rate for current step.
auto learning_rate = v.constant(lr_scalar);
auto prox_v = v;
// v = w - g * learning_rate.
prox_v -= g * learning_rate;
if (l1_scalar > 0) {
v = prox_v.abs() - learning_rate * prox_v.constant(l1_scalar);
// compute sign(v) * max(|v| - lr * l1, 0)
v = prox_v.sign() * v.cwiseMax(static_cast<T>(0.0));
} else {
v = prox_v;
}
if (l2_scalar > 0) {
v /= (v.constant(1.0) + v.constant(l2_scalar) * learning_rate);
}
}
} else {
CHECK_EQ(1, inner_dim);
auto indices_vec = indices.vec<Tindex>();
auto var_flat = var.flat<T>();
auto grad_flat = grad.flat<T>();
T lr_scalar = lr.scalar<T>()();
T l1_scalar = l1.scalar<T>()();
T l2_scalar = l2.scalar<T>()();
const Tindex first_dim_size = var_flat.size();
for (Tindex i = 0; i < N; i++) {
const Tindex index = internal::SubtleMustCopy(indices_vec(i));
OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
errors::InvalidArgument(
strings::StrCat("Index ", index, " at offset ", i,
" in indices is out of range")));
const T& g = grad_flat(i);
auto learning_rate = lr_scalar;
auto prox_v = var_flat(index);
prox_v -= learning_rate * g;
if (l1_scalar > 0) {
var_flat(index) = std::abs(prox_v) - learning_rate * l1_scalar;
var_flat(index) =
sgn(prox_v) * std::max(var_flat(index), static_cast<T>(0.0));
} else {
var_flat(index) = prox_v;
}
if (l2_scalar > 0) {
var_flat(index) /= (1.0 + l2_scalar * learning_rate);
}
}
}
}
ctx->forward_ref_input_to_ref_output(0, 0);
}
private:
bool use_exclusive_lock_;
};
#define REGISTER_KERNELS(T, Tindices) \
REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalGradientDescent") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.TypeConstraint<Tindices>("Tindices"), \
SparseApplyProximalGradientDescentOp<T, Tindices>);
REGISTER_KERNELS(Eigen::half, int32);
REGISTER_KERNELS(Eigen::half, int64);
REGISTER_KERNELS(float, int32);
REGISTER_KERNELS(float, int64);
REGISTER_KERNELS(double, int32);
REGISTER_KERNELS(double, int64);
#undef REGISTER_KERNELS
template <typename Device, typename T>
@ -568,10 +838,11 @@ typedef Eigen::GpuDevice GPUDevice;
REGISTER_KERNEL_BUILDER( \
Name("ApplyAdagrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyAdagradOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
@ -593,15 +864,80 @@ REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
template <typename Device, typename T>
class ApplyProximalAdagradOp : public OpKernel {
public:
explicit ApplyProximalAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
}
void Compute(OpKernelContext* ctx) override {
auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
OP_REQUIRES(
ctx, var.IsInitialized(),
errors::FailedPrecondition(
"Attempting to use uninitialized variables: ", def().input(0)));
OP_REQUIRES(
ctx, accum.IsInitialized(),
errors::FailedPrecondition(
"Attempting to use uninitialized variables: ", def().input(1)));
OP_REQUIRES(
ctx, var.shape().IsSameSize(accum.shape()),
errors::InvalidArgument("var and accum do not have the same shape",
var.shape().DebugString(), " ",
accum.shape().DebugString()));
const Tensor& lr = ctx->input(2);
OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()),
errors::InvalidArgument("lr is not a scalar: ",
lr.shape().DebugString()));
const Tensor& l1 = ctx->input(3);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l1.shape()),
errors::InvalidArgument("l1 regularization strength is not a scalar: ",
l1.shape().DebugString()));
const Tensor& l2 = ctx->input(4);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l2.shape()),
errors::InvalidArgument("l2 regularization strength is not a scalar: ",
l2.shape().DebugString()));
const Tensor& grad = ctx->input(5);
OP_REQUIRES(
ctx, var.shape().IsSameSize(grad.shape()),
errors::InvalidArgument("var and grad do not have the same shape",
var.shape().DebugString(), " ",
grad.shape().DebugString()));
const Device& device = ctx->template eigen_device<Device>();
functor::ApplyProximalAdagrad<Device, T>()(
device, var.flat<T>(), accum.flat<T>(), lr.scalar<T>(), l1.scalar<T>(),
l2.scalar<T>(), grad.flat<T>());
ctx->forward_ref_input_to_ref_output(0, 0);
}
private:
bool use_exclusive_lock_;
};
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#define REGISTER_KERNELS(D, T) \
REGISTER_KERNEL_BUILDER( \
Name("ApplyProximalAdagrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyProximalAdagradOp<D##Device, T>);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
#undef REGISTER_KERNELS
namespace {
template <class T>
inline T sgn(const T x) {
T zero(0);
T one(1);
return (x == zero ? zero : (x < zero ? -one : one));
}
template <typename T>
inline T FtrlCompute(const T& accum, const T& linear, const T& lr, const T& l1,
@ -730,9 +1066,167 @@ class SparseApplyAdagradOp : public OpKernel {
.TypeConstraint<T>("T") \
.TypeConstraint<Tindices>("Tindices"), \
SparseApplyAdagradOp<T, Tindices>);
#define REGISTER_CPU_KERNELS(T) \
REGISTER_KERNELS(T, int32); \
REGISTER_KERNELS(T, int64);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
// Note, this op works on cpu only.
template <typename T, typename Tindex>
class SparseApplyProximalAdagradOp : public OpKernel {
public:
explicit SparseApplyProximalAdagradOp(OpKernelConstruction* ctx)
: OpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
}
void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
OP_REQUIRES(
ctx, var.IsInitialized(),
errors::FailedPrecondition(
"Attempting to use uninitialized variables: ", def().input(0)));
OP_REQUIRES(
ctx, accum.IsInitialized(),
errors::FailedPrecondition(
"Attempting to use uninitialized variables: ", def().input(1)));
OP_REQUIRES(
ctx, var.shape().IsSameSize(accum.shape()),
errors::InvalidArgument("var and accum do not have the same shape",
var.shape().DebugString(), " ",
accum.shape().DebugString()));
OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
errors::InvalidArgument("var must be at least 1 dimensional"));
const Tensor& lr = ctx->input(2);
OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()),
errors::InvalidArgument("lr is not a scalar: ",
lr.shape().DebugString()));
const Tensor& l1 = ctx->input(3);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l1.shape()),
errors::InvalidArgument("l1 regularization strength is not a scalar: ",
l1.shape().DebugString()));
const Tensor& l2 = ctx->input(4);
OP_REQUIRES(
ctx, TensorShapeUtils::IsScalar(l2.shape()),
errors::InvalidArgument("l2 regularization strength is not a scalar: ",
l2.shape().DebugString()));
const Tensor& grad = ctx->input(5);
const Tensor& indices = ctx->input(6);
OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
errors::InvalidArgument("indices must be one-dimensional"));
int64 inner_dim = 1;
for (int d = 1; d < var.dims(); d++) {
OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
errors::InvalidArgument(strings::StrCat(
"var and grad must match in dimension ", d)));
inner_dim *= grad.dim_size(d);
}
const Tindex N = indices.dim_size(0);
OP_REQUIRES(
ctx, grad.dim_size(0) == N,
errors::InvalidArgument(
"grad must be the same size as indices in the first dimension."));
if (N > 0) {
if (inner_dim > 1) {
const Tindex first_dim_size = var.dim_size(0);
auto indices_vec = indices.vec<Tindex>();
auto var_flat = var.flat_outer_dims<T>();
auto accum_flat = accum.flat_outer_dims<T>();
auto grad_flat = grad.flat_outer_dims<T>();
T lr_scalar = lr.scalar<T>()();
T l1_scalar = l1.scalar<T>()();
T l2_scalar = l2.scalar<T>()();
for (Tindex i = 0; i < N; i++) {
const Tindex index = internal::SubtleMustCopy(indices_vec(i));
OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
errors::InvalidArgument(
strings::StrCat("Index ", index, " at offset ", i,
" in indices is out of range")));
auto a = accum_flat.template chip<0>(index);
auto g = grad_flat.template chip<0>(i);
auto v = var_flat.template chip<0>(index);
a += g.square();
// compute learning_rate for current step.
auto learning_rate = a.constant(lr_scalar) * a.rsqrt();
auto prox_v = v;
// v = w - g * learning_rate.
prox_v -= g * learning_rate;
if (l1_scalar > 0) {
v = prox_v.abs() - learning_rate * prox_v.constant(l1_scalar);
// compute sign(v) * max(|v| - lr * l1, 0)
v = prox_v.sign() * v.cwiseMax(static_cast<T>(0.0));
} else {
v = prox_v;
}
if (l2_scalar > 0) {
v /= (v.constant(1.0) + v.constant(l2_scalar) * learning_rate);
}
}
} else {
CHECK_EQ(1, inner_dim);
auto indices_vec = indices.vec<Tindex>();
auto var_flat = var.flat<T>();
auto accum_flat = accum.flat<T>();
auto grad_flat = grad.flat<T>();
T lr_scalar = lr.scalar<T>()();
T l1_scalar = l1.scalar<T>()();
T l2_scalar = l2.scalar<T>()();
const Tindex first_dim_size = accum_flat.size();
for (Tindex i = 0; i < N; i++) {
const Tindex index = internal::SubtleMustCopy(indices_vec(i));
OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
errors::InvalidArgument(
strings::StrCat("Index ", index, " at offset ", i,
" in indices is out of range")));
T& a = accum_flat(index);
const T& g = grad_flat(i);
a += g * g;
auto learning_rate = lr_scalar / std::sqrt(a);
auto prox_v = var_flat(index);
prox_v -= learning_rate * g;
if (l1_scalar > 0) {
var_flat(index) = std::abs(prox_v) - learning_rate * l1_scalar;
var_flat(index) =
sgn(prox_v) * std::max(var_flat(index), static_cast<T>(0.0));
} else {
var_flat(index) = prox_v;
}
if (l2_scalar > 0) {
var_flat(index) /= (1.0 + l2_scalar * learning_rate);
}
}
}
}
ctx->forward_ref_input_to_ref_output(0, 0);
}
private:
bool use_exclusive_lock_;
};
#define REGISTER_KERNELS(T, Tindices) \
REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalAdagrad") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T") \
.TypeConstraint<Tindices>("Tindices"), \
SparseApplyProximalAdagradOp<T, Tindices>);
REGISTER_KERNELS(Eigen::half, int32);
REGISTER_KERNELS(Eigen::half, int64);
REGISTER_KERNELS(float, int32);
REGISTER_KERNELS(float, int64);
REGISTER_KERNELS(double, int32);
@ -822,10 +1316,13 @@ typedef Eigen::GpuDevice GPUDevice;
REGISTER_KERNEL_BUILDER( \
Name("ApplyFtrl").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyFtrlOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
// Note, this op works on cpu only.
@ -1002,13 +1499,15 @@ class SparseApplyFtrlOp : public OpKernel {
.TypeConstraint<T>("T") \
.TypeConstraint<Tindices>("Tindices"), \
SparseApplyFtrlOp<CPUDevice, T, Tindices>);
#define REGISTER_CPU_KERNELS(T) \
REGISTER_KERNELS(T, int32); \
REGISTER_KERNELS(T, int64);
REGISTER_KERNELS(Eigen::half, int32);
REGISTER_KERNELS(Eigen::half, int64);
REGISTER_KERNELS(float, int32);
REGISTER_KERNELS(float, int64);
REGISTER_KERNELS(double, int32);
REGISTER_KERNELS(double, int64);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
template <typename Device, typename T>
@ -1070,10 +1569,11 @@ typedef Eigen::GpuDevice GPUDevice;
REGISTER_KERNEL_BUILDER( \
Name("ApplyMomentum").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyMomentumOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
@ -1096,6 +1596,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
// Note, this op works on cpu only.
@ -1129,7 +1630,7 @@ class SparseApplyMomentumOp : public OpKernel {
const Tensor& lr = ctx->input(2);
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
errors::InvalidArgument("lr is not a scalar: ",
errors::InvalidArgument("lr is not a scalar : ",
lr.shape().DebugString()));
const Tensor& grad = ctx->input(3);
const Tensor& indices = ctx->input(4);
@ -1188,13 +1689,15 @@ class SparseApplyMomentumOp : public OpKernel {
.TypeConstraint<T>("T") \
.TypeConstraint<Tindices>("Tindices"), \
SparseApplyMomentumOp<T, Tindices>);
#define REGISTER_CPU_KERNELS(T) \
REGISTER_KERNELS(T, int32); \
REGISTER_KERNELS(T, int64);
REGISTER_KERNELS(Eigen::half, int32);
REGISTER_KERNELS(Eigen::half, int64);
REGISTER_KERNELS(float, int32);
REGISTER_KERNELS(float, int64);
REGISTER_KERNELS(double, int32);
REGISTER_KERNELS(double, int64);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
template <typename Device, typename T>
@ -1237,7 +1740,7 @@ class ApplyAdamOp : public OpKernel {
errors::InvalidArgument("beta2_power is not a scalar: ",
beta2_power.shape().DebugString()));
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
errors::InvalidArgument("lr is not a scalar: ",
errors::InvalidArgument("lr is not a scalar : ",
lr.shape().DebugString()));
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()),
errors::InvalidArgument("beta1 is not a scalar: ",
@ -1285,10 +1788,11 @@ typedef Eigen::GpuDevice GPUDevice;
REGISTER_KERNEL_BUILDER( \
Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyAdamOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
@ -1316,6 +1820,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
template <typename Device, typename T>
@ -1352,7 +1857,7 @@ class ApplyRMSPropOp : public OpKernel {
const Tensor& grad = ctx->input(7);
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
errors::InvalidArgument("lr is not a scalar: ",
errors::InvalidArgument("lr is not a scalar : ",
lr.shape().DebugString()));
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho.shape()),
errors::InvalidArgument("rho is not a scalar: ",
@ -1400,10 +1905,11 @@ typedef Eigen::GpuDevice GPUDevice;
REGISTER_KERNEL_BUILDER( \
Name("ApplyRMSProp").Device(DEVICE_##D).TypeConstraint<T>("T"), \
ApplyRMSPropOp<D##Device, T>);
#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
REGISTER_KERNELS(CPU, Eigen::half);
REGISTER_KERNELS(CPU, float);
REGISTER_KERNELS(CPU, double);
TF_CALL_half(REGISTER_CPU_KERNELS);
TF_CALL_float(REGISTER_CPU_KERNELS);
TF_CALL_double(REGISTER_CPU_KERNELS);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
@ -1428,6 +1934,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
REGISTER_KERNELS(GPU, float);
REGISTER_KERNELS(GPU, double);
#endif
#undef REGISTER_CPU_KERNELS
#undef REGISTER_KERNELS
} // namespace tensorflow

View File

@ -44,6 +44,24 @@ struct ApplyAdadelta {
typename TTypes<T>::ConstFlat grad);
};
template <typename Device, typename T>
struct FobosElasticNet {
void operator()(const Device& d, typename TTypes<T>::Flat var,
typename TTypes<T>::ConstScalar lr,
typename TTypes<T>::ConstScalar l1,
typename TTypes<T>::ConstScalar l2,
typename TTypes<T>::ConstFlat grad);
};
template <typename Device, typename T>
struct ApplyProximalGradientDescent {
void operator()(const Device& d, typename TTypes<T>::Flat var,
typename TTypes<T>::ConstScalar lr,
typename TTypes<T>::ConstScalar l1,
typename TTypes<T>::ConstScalar l2,
typename TTypes<T>::ConstFlat grad);
};
template <typename Device, typename T>
struct ApplyAdagrad {
void operator()(const Device& d, typename TTypes<T>::Flat var,
@ -52,6 +70,16 @@ struct ApplyAdagrad {
typename TTypes<T>::ConstFlat grad);
};
template <typename Device, typename T>
struct ApplyProximalAdagrad {
void operator()(const Device& d, typename TTypes<T>::Flat var,
typename TTypes<T>::Flat accum,
typename TTypes<T>::ConstScalar lr,
typename TTypes<T>::ConstScalar l1,
typename TTypes<T>::ConstScalar l2,
typename TTypes<T>::ConstFlat grad);
};
template <typename Device, typename T>
struct ApplyFtrl {
void operator()(const Device& d, typename TTypes<T>::Flat var,

View File

@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/core/kernels/xent_op.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
@ -86,18 +87,14 @@ struct XentFunctor<CPUDevice, T> {
};
} // namespace functor
REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
.Device(DEVICE_CPU)
.TypeConstraint<Eigen::half>("T"),
SoftmaxXentWithLogitsOp<CPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
.Device(DEVICE_CPU)
.TypeConstraint<float>("T"),
SoftmaxXentWithLogitsOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
.Device(DEVICE_CPU)
.TypeConstraint<double>("T"),
SoftmaxXentWithLogitsOp<CPUDevice, double>);
#define REGISTER_CPU(T) \
REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") \
.Device(DEVICE_CPU) \
.TypeConstraint<T>("T"), \
SoftmaxXentWithLogitsOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU);
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")

View File

@ -1825,6 +1825,127 @@ op {
}
}
}
op {
name: "ApplyProximalAdagrad"
input_arg {
name: "var"
type_attr: "T"
is_ref: true
}
input_arg {
name: "accum"
type_attr: "T"
is_ref: true
}
input_arg {
name: "lr"
type_attr: "T"
}
input_arg {
name: "l1"
type_attr: "T"
}
input_arg {
name: "l2"
type_attr: "T"
}
input_arg {
name: "grad"
type_attr: "T"
}
output_arg {
name: "out"
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
}
}
op {
name: "ApplyProximalGradientDescent"
input_arg {
name: "var"
type_attr: "T"
is_ref: true
}
input_arg {
name: "alpha"
type_attr: "T"
}
input_arg {
name: "l1"
type_attr: "T"
}
input_arg {
name: "l2"
type_attr: "T"
}
input_arg {
name: "delta"
type_attr: "T"
}
output_arg {
name: "out"
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
}
}
op {
name: "ApplyRMSProp"
input_arg {
@ -3011,6 +3132,63 @@ op {
}
}
}
op {
name: "AvgPool"
input_arg {
name: "value"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "data_format"
type: "string"
default_value {
s: "NHWC"
}
allowed_values {
list {
s: "NHWC"
s: "NCHW"
}
}
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
type: DT_DOUBLE
}
}
}
}
op {
name: "AvgPool3D"
input_arg {
@ -3232,6 +3410,67 @@ op {
}
}
}
op {
name: "AvgPoolGrad"
input_arg {
name: "orig_input_shape"
type: DT_INT32
}
input_arg {
name: "grad"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "data_format"
type: "string"
default_value {
s: "NHWC"
}
allowed_values {
list {
s: "NHWC"
s: "NCHW"
}
}
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
type: DT_DOUBLE
}
}
}
}
op {
name: "BatchCholesky"
input_arg {
@ -11801,6 +12040,124 @@ op {
}
}
}
op {
name: "MaxPool"
input_arg {
name: "input"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "data_format"
type: "string"
default_value {
s: "NHWC"
}
allowed_values {
list {
s: "NHWC"
s: "NCHW"
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
}
op {
name: "MaxPool"
input_arg {
name: "input"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "data_format"
type: "string"
default_value {
s: "NHWC"
}
allowed_values {
list {
s: "NHWC"
s: "NCHW"
}
}
}
}
op {
name: "MaxPool3D"
input_arg {
@ -12014,6 +12371,73 @@ op {
}
}
}
op {
name: "MaxPoolGrad"
input_arg {
name: "orig_input"
type_attr: "T"
}
input_arg {
name: "orig_output"
type_attr: "T"
}
input_arg {
name: "grad"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "data_format"
type: "string"
default_value {
s: "NHWC"
}
allowed_values {
list {
s: "NHWC"
s: "NCHW"
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
}
op {
name: "MaxPoolGradWithArgmax"
input_arg {
@ -12065,6 +12489,70 @@ op {
}
}
}
op {
name: "MaxPoolGradWithArgmax"
input_arg {
name: "input"
type_attr: "T"
}
input_arg {
name: "grad"
type_attr: "T"
}
input_arg {
name: "argmax"
type_attr: "Targmax"
}
output_arg {
name: "output"
type_attr: "T"
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "Targmax"
type: "type"
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
}
op {
name: "MaxPoolWithArgmax"
input_arg {
@ -12115,6 +12603,69 @@ op {
}
}
}
op {
name: "MaxPoolWithArgmax"
input_arg {
name: "input"
type_attr: "T"
}
output_arg {
name: "output"
type_attr: "T"
}
output_arg {
name: "argmax"
type_attr: "Targmax"
}
attr {
name: "ksize"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "strides"
type: "list(int)"
has_minimum: true
minimum: 4
}
attr {
name: "Targmax"
type: "type"
default_value {
type: DT_INT64
}
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
attr {
name: "padding"
type: "string"
allowed_values {
list {
s: "SAME"
s: "VALID"
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
}
op {
name: "Maximum"
input_arg {
@ -20499,6 +21050,155 @@ op {
}
}
}
op {
name: "SparseApplyProximalAdagrad"
input_arg {
name: "var"
type_attr: "T"
is_ref: true
}
input_arg {
name: "accum"
type_attr: "T"
is_ref: true
}
input_arg {
name: "lr"
type_attr: "T"
}
input_arg {
name: "l1"
type_attr: "T"
}
input_arg {
name: "l2"
type_attr: "T"
}
input_arg {
name: "grad"
type_attr: "T"
}
input_arg {
name: "indices"
type_attr: "Tindices"
}
output_arg {
name: "out"
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "Tindices"
type: "type"
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
}
}
op {
name: "SparseApplyProximalGradientDescent"
input_arg {
name: "var"
type_attr: "T"
is_ref: true
}
input_arg {
name: "alpha"
type_attr: "T"
}
input_arg {
name: "l1"
type_attr: "T"
}
input_arg {
name: "l2"
type_attr: "T"
}
input_arg {
name: "grad"
type_attr: "T"
}
input_arg {
name: "indices"
type_attr: "Tindices"
}
output_arg {
name: "out"
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "Tindices"
type: "type"
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
}
}
op {
name: "SparseConcat"
input_arg {

View File

@ -154,22 +154,25 @@ Status MaxPoolGrad(const AttrSlice& attrs, FunctionDef* g) {
// clang-format off
*g = FDH::Define(
// Arg defs
{"input: float", "grad: float"},
{"input: T", "grad: T"},
// Ret val defs
{"output: float"},
{"output: T"},
// Attr defs
{"ksize: list(int) >= 4",
{"T: {float, half} = DT_FLOAT",
"ksize: list(int) >= 4",
"strides: list(int) >= 4",
GetPaddingAttrString()},
// Nodes
{
// Invoke MaxPool again to recompute the outputs (removed by CSE?).
{{"maxpool"}, "MaxPool", {"input"},
/*Attrs=*/{{"ksize", "$ksize"},
/*Attrs=*/{{"T", "$T"},
{"ksize", "$ksize"},
{"strides", "$strides"},
{"padding", "$padding"}}},
{{"output"}, "MaxPoolGrad", {"input", "maxpool", "grad"},
/*Attrs=*/{{"ksize", "$ksize"},
/*Attrs=*/{{"T", "$T"},
{"ksize", "$ksize"},
{"strides", "$strides"},
{"padding", "$padding"}}}
});
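The gradient function above recomputes MaxPool and hands its output to MaxPoolGrad, which routes each output gradient back to the argmax position of its input window. A toy NumPy sketch of that routing, assuming a 2x2 window, stride 2, VALID padding, and a single channel (hypothetical helper, not the kernel):

```python
import numpy as np

def max_pool_grad_2x2(inp, grad_out):
    # Route each pooled gradient back to the input cell that won the max.
    h, w = inp.shape
    grad_in = np.zeros_like(inp)
    for i in range(0, h, 2):
        for j in range(0, w, 2):
            window = inp[i:i + 2, j:j + 2]
            r, c = np.unravel_index(np.argmax(window), window.shape)
            grad_in[i + r, j + c] += grad_out[i // 2, j // 2]
    return grad_in
```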

View File

@ -28,7 +28,7 @@ REGISTER_OP("AvgPool")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Attr("T: {float, double}")
.Attr("T: {float, half, double}")
.Doc(R"doc(
Performs average pooling on the input.
@ -55,7 +55,7 @@ REGISTER_OP("AvgPoolGrad")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Attr("T: {float, double}")
.Attr("T: {float, half, double}")
.Doc(R"doc(
Computes gradients of the average pooling function.
@ -642,12 +642,13 @@ output: The gradients for LRN.
// --------------------------------------------------------------------------
REGISTER_OP("MaxPool")
.Attr("T: {float, half} = DT_FLOAT")
.Attr("ksize: list(int) >= 4")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Input("input: float")
.Output("output: float")
.Input("input: T")
.Output("output: T")
.Doc(R"doc(
Performs max pooling on the input.
@ -669,10 +670,11 @@ REGISTER_OP("MaxPoolGrad")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr(GetConvnetDataFormatAttrString())
.Input("orig_input: float")
.Input("orig_output: float")
.Input("grad: float")
.Output("output: float")
.Input("orig_input: T")
.Input("orig_output: T")
.Input("grad: T")
.Output("output: T")
.Attr("T: {float, half} = DT_FLOAT")
.Doc(R"doc(
Computes gradients of the maxpooling function.
@ -696,9 +698,10 @@ REGISTER_OP("MaxPoolWithArgmax")
.Attr("strides: list(int) >= 4")
.Attr("Targmax: {int32, int64} = DT_INT64")
.Attr(GetPaddingAttrString())
.Input("input: float")
.Output("output: float")
.Input("input: T")
.Output("output: T")
.Output("argmax: Targmax")
.Attr("T: {float, half} = DT_FLOAT")
.Doc(R"doc(
Performs max pooling on the input and outputs both max values and indices.
@ -720,10 +723,11 @@ REGISTER_OP("MaxPoolGradWithArgmax")
.Attr("strides: list(int) >= 4")
.Attr(GetPaddingAttrString())
.Attr("Targmax: {int32, int64}")
.Input("input: float")
.Input("grad: float")
.Input("input: T")
.Input("grad: T")
.Input("argmax: Targmax")
.Output("output: float")
.Output("output: T")
.Attr("T: {float, half} = DT_FLOAT")
.Doc(R"doc(
Computes gradients of the maxpooling function.

View File

@ -591,12 +591,12 @@ op {
}
input_arg {
name: "l1"
description: "Scaling factor. Must be a scalar."
description: "L1 regulariation. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l2"
description: "Scaling factor. Must be a scalar."
description: "L2 regulariation. Must be a scalar."
type_attr: "T"
}
input_arg {
@ -767,6 +767,146 @@ op {
summary: "Update \'*var\' according to the momentum scheme."
description: "accum = accum * momentum + grad\nvar -= lr * accum"
}
op {
name: "ApplyProximalAdagrad"
input_arg {
name: "var"
description: "Should be from a Variable()."
type_attr: "T"
is_ref: true
}
input_arg {
name: "accum"
description: "Should be from a Variable()."
type_attr: "T"
is_ref: true
}
input_arg {
name: "lr"
description: "Scaling factor. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l1"
description: "L1 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l2"
description: "L2 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "grad"
description: "The gradient."
type_attr: "T"
}
output_arg {
name: "out"
description: "Same as \"var\"."
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
}
summary: "Update \'*var\' and \'*accum\' according to FOBOS with Adagrad learning rate."
description: "accum += grad * grad\nprox_v = var - lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}"
}
op {
name: "ApplyProximalGradientDescent"
input_arg {
name: "var"
description: "Should be from a Variable()."
type_attr: "T"
is_ref: true
}
input_arg {
name: "alpha"
description: "Scaling factor. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l1"
description: "L1 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l2"
description: "L2 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "delta"
description: "The change."
type_attr: "T"
}
output_arg {
name: "out"
description: "Same as \"var\"."
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
}
summary: "Update \'*var\' as FOBOS algorithm with fixed learning rate."
description: "prox_v = var - alpha * delta\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}"
}
op {
name: "ApplyRMSProp"
input_arg {
@ -1251,6 +1391,7 @@ op {
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
type: DT_DOUBLE
}
}
@ -1447,6 +1588,7 @@ op {
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
type: DT_DOUBLE
}
}
@ -6614,12 +6756,25 @@ op {
input_arg {
name: "input"
description: "4-D input to pool over."
type: DT_FLOAT
type_attr: "T"
}
output_arg {
name: "output"
description: "The max pooled output tensor."
type: DT_FLOAT
type_attr: "T"
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
attr {
name: "ksize"
@ -6798,22 +6953,22 @@ op {
input_arg {
name: "orig_input"
description: "The original input tensor."
type: DT_FLOAT
type_attr: "T"
}
input_arg {
name: "orig_output"
description: "The original output tensor."
type: DT_FLOAT
type_attr: "T"
}
input_arg {
name: "grad"
description: "4-D. Gradients w.r.t. the output of `max_pool`."
type: DT_FLOAT
type_attr: "T"
}
output_arg {
name: "output"
description: "Gradients w.r.t. the input to `max_pool`."
type: DT_FLOAT
type_attr: "T"
}
attr {
name: "ksize"
@ -6854,6 +7009,19 @@ op {
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
summary: "Computes gradients of the maxpooling function."
}
op {
@ -6861,12 +7029,12 @@ op {
input_arg {
name: "input"
description: "The original input."
type: DT_FLOAT
type_attr: "T"
}
input_arg {
name: "grad"
description: "4-D with shape `[batch, height, width, channels]`. Gradients w.r.t. the\noutput of `max_pool`."
type: DT_FLOAT
type_attr: "T"
}
input_arg {
name: "argmax"
@ -6876,7 +7044,7 @@ op {
output_arg {
name: "output"
description: "Gradients w.r.t. the input of `max_pool`."
type: DT_FLOAT
type_attr: "T"
}
attr {
name: "ksize"
@ -6913,6 +7081,19 @@ op {
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
summary: "Computes gradients of the maxpooling function."
}
op {
@ -6920,12 +7101,12 @@ op {
input_arg {
name: "input"
description: "4-D with shape `[batch, height, width, channels]`. Input to pool over."
type: DT_FLOAT
type_attr: "T"
}
output_arg {
name: "output"
description: "The max pooled output tensor."
type: DT_FLOAT
type_attr: "T"
}
output_arg {
name: "argmax"
@ -6970,6 +7151,19 @@ op {
}
}
}
attr {
name: "T"
type: "type"
default_value {
type: DT_FLOAT
}
allowed_values {
list {
type: DT_FLOAT
type: DT_HALF
}
}
}
summary: "Performs max pooling on the input and outputs both max values and indices."
description: "The indices in `argmax` are flattened, so that a maximum value at position\n`[b, y, x, c]` becomes flattened index\n`((b * height + y) * width + x) * channels + c`."
}
@ -11380,12 +11574,12 @@ op {
}
input_arg {
name: "l1"
description: "Scaling factor. Must be a scalar."
description: "L1 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l2"
description: "Scaling factor. Must be a scalar."
description: "L2 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
@ -11525,6 +11719,176 @@ op {
summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme."
description: "That is for rows we have grad for, we update var and accum as follows:\n\naccum = accum * momentum + grad\nvar -= lr * accum"
}
op {
name: "SparseApplyProximalAdagrad"
input_arg {
name: "var"
description: "Should be from a Variable()."
type_attr: "T"
is_ref: true
}
input_arg {
name: "accum"
description: "Should be from a Variable()."
type_attr: "T"
is_ref: true
}
input_arg {
name: "lr"
description: "Learning rate. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l1"
description: "L1 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l2"
description: "L2 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "grad"
description: "The gradient."
type_attr: "T"
}
input_arg {
name: "indices"
description: "A vector of indices into the first dimension of var and accum."
type_attr: "Tindices"
}
output_arg {
name: "out"
description: "Same as \"var\"."
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "Tindices"
type: "type"
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
}
summary: "Sparse update entries in \'*var\' and \'*accum\' according to FOBOS algorithm."
description: "That is for rows we have grad for, we update var and accum as follows:\naccum += grad * grad\nprox_v = var\nprox_v -= lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}"
}
op {
name: "SparseApplyProximalGradientDescent"
input_arg {
name: "var"
description: "Should be from a Variable()."
type_attr: "T"
is_ref: true
}
input_arg {
name: "alpha"
description: "Scaling factor. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l1"
description: "L1 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "l2"
description: "L2 regularization. Must be a scalar."
type_attr: "T"
}
input_arg {
name: "grad"
description: "The gradient."
type_attr: "T"
}
input_arg {
name: "indices"
description: "A vector of indices into the first dimension of var and accum."
type_attr: "Tindices"
}
output_arg {
name: "out"
description: "Same as \"var\"."
type_attr: "T"
is_ref: true
}
attr {
name: "T"
type: "type"
allowed_values {
list {
type: DT_FLOAT
type: DT_DOUBLE
type: DT_INT64
type: DT_INT32
type: DT_UINT8
type: DT_UINT16
type: DT_INT16
type: DT_INT8
type: DT_COMPLEX64
type: DT_COMPLEX128
type: DT_QINT8
type: DT_QUINT8
type: DT_QINT32
type: DT_HALF
}
}
}
attr {
name: "Tindices"
type: "type"
allowed_values {
list {
type: DT_INT32
type: DT_INT64
}
}
}
attr {
name: "use_locking"
type: "bool"
default_value {
b: false
}
description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
}
summary: "Sparse update \'*var\' as FOBOS algorithm with fixed learning rate."
description: "That is for rows we have grad for, we update var as follows:\nprox_v = var - alpha * grad\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}"
}
op {
name: "SparseConcat"
input_arg {

View File

@ -35,6 +35,59 @@ use_locking: If `True`, the subtraction will be protected by a lock;
otherwise the behavior is undefined, but may exhibit less contention.
)doc");
REGISTER_OP("ApplyProximalGradientDescent")
.Input("var: Ref(T)")
.Input("alpha: T")
.Input("l1: T")
.Input("l2: T")
.Input("delta: T")
.Output("out: Ref(T)")
.Attr("T: numbertype")
.Attr("use_locking: bool = false")
.Doc(R"doc(
Update '*var' as FOBOS algorithm with fixed learning rate.
prox_v = var - alpha * delta
var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
var: Should be from a Variable().
alpha: Scaling factor. Must be a scalar.
l1: L1 regularization. Must be a scalar.
l2: L2 regularization. Must be a scalar.
delta: The change.
out: Same as "var".
use_locking: If True, the subtraction will be protected by a lock;
otherwise the behavior is undefined, but may exhibit less contention.
)doc");
REGISTER_OP("SparseApplyProximalGradientDescent")
.Input("var: Ref(T)")
.Input("alpha: T")
.Input("l1: T")
.Input("l2: T")
.Input("grad: T")
.Input("indices: Tindices")
.Output("out: Ref(T)")
.Attr("T: numbertype")
.Attr("Tindices: {int32, int64}")
.Attr("use_locking: bool = false")
.Doc(R"doc(
Sparse update '*var' as FOBOS algorithm with fixed learning rate.
That is for rows we have grad for, we update var as follows:
prox_v = var - alpha * grad
var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
var: Should be from a Variable().
alpha: Scaling factor. Must be a scalar.
l1: L1 regularization. Must be a scalar.
l2: L2 regularization. Must be a scalar.
grad: The gradient.
indices: A vector of indices into the first dimension of var and accum.
out: Same as "var".
use_locking: If True, the subtraction will be protected by a lock;
otherwise the behavior is undefined, but may exhibit less contention.
)doc");
REGISTER_OP("ApplyAdadelta")
.Input("var: Ref(T)")
.Input("accum: Ref(T)")
@ -117,6 +170,33 @@ use_locking: If `True`, updating of the var and accum tensors will be protected
contention.
)doc");
REGISTER_OP("ApplyProximalAdagrad")
.Input("var: Ref(T)")
.Input("accum: Ref(T)")
.Input("lr: T")
.Input("l1: T")
.Input("l2: T")
.Input("grad: T")
.Output("out: Ref(T)")
.Attr("T: numbertype")
.Attr("use_locking: bool = false")
.Doc(R"doc(
Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
accum += grad * grad
prox_v = var - lr * grad * (1 / sqrt(accum))
var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
var: Should be from a Variable().
accum: Should be from a Variable().
grad: The gradient.
lr: Scaling factor. Must be a scalar.
l1: L1 regularization. Must be a scalar.
l2: L2 regularization. Must be a scalar.
out: Same as "var".
use_locking: If True, updating of the var and accum tensors will be protected by
a lock; otherwise the behavior is undefined, but may exhibit less contention.
)doc");
REGISTER_OP("SparseApplyAdagrad")
.Input("var: Ref(T)")
.Input("accum: Ref(T)")
@ -145,6 +225,39 @@ use_locking: If `True`, updating of the var and accum tensors will be protected
contention.
)doc");
REGISTER_OP("SparseApplyProximalAdagrad")
.Input("var: Ref(T)")
.Input("accum: Ref(T)")
.Input("lr: T")
.Input("l1: T")
.Input("l2: T")
.Input("grad: T")
.Input("indices: Tindices")
.Output("out: Ref(T)")
.Attr("T: numbertype")
.Attr("Tindices: {int32, int64}")
.Attr("use_locking: bool = false")
.Doc(R"doc(
Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
That is for rows we have grad for, we update var and accum as follows:
accum += grad * grad
prox_v = var
prox_v -= lr * grad * (1 / sqrt(accum))
var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
var: Should be from a Variable().
accum: Should be from a Variable().
lr: Learning rate. Must be a scalar.
l1: L1 regularization. Must be a scalar.
l2: L2 regularization. Must be a scalar.
grad: The gradient.
indices: A vector of indices into the first dimension of var and accum.
out: Same as "var".
use_locking: If True, updating of the var and accum tensors will be protected by
a lock; otherwise the behavior is undefined, but may exhibit less contention.
)doc");
REGISTER_OP("ApplyFtrl")
.Input("var: Ref(T)")
.Input("accum: Ref(T)")
@ -171,8 +284,8 @@ accum: Should be from a Variable().
linear: Should be from a Variable().
grad: The gradient.
lr: Scaling factor. Must be a scalar.
l1: Scaling factor. Must be a scalar.
l2: Scaling factor. Must be a scalar.
l1: L1 regularization. Must be a scalar.
l2: L2 regularization. Must be a scalar.
lr_power: Scaling factor. Must be a scalar.
out: Same as "var".
use_locking: If `True`, updating of the var and accum tensors will be protected
@ -210,8 +323,8 @@ linear: Should be from a Variable().
grad: The gradient.
indices: A vector of indices into the first dimension of var and accum.
lr: Scaling factor. Must be a scalar.
l1: Scaling factor. Must be a scalar.
l2: Scaling factor. Must be a scalar.
l1: L1 regularization. Must be a scalar.
l2: L2 regularization. Must be a scalar.
lr_power: Scaling factor. Must be a scalar.
out: Same as "var".
use_locking: If `True`, updating of the var and accum tensors will be protected

View File

@ -69,9 +69,9 @@ class Stat {
: static_cast<HighPrecisionValueType>(sum_) / count_;
}
ValueType rms() const { return sqrt(squared_sum_ / count_); }
ValueType std_deviation() const { return all_same() ? 0 : rms() - avg(); }
ValueType std_deviation() const {
return all_same() ? 0 : sqrt(squared_sum_ / count_ - avg() * avg());
}
void OutputToStream(std::ostream* stream) const {
if (empty()) {
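A quick numeric check of why the old one-liner was wrong: `rms() - avg()` is not the standard deviation, whereas `sqrt(squared_sum_ / count_ - avg()^2)` matches the population definition (plain Python, made-up values):

```python
import math

xs = [1.0, 2.0, 3.0, 4.0]
n = len(xs)
mean = sum(xs) / n
mean_sq = sum(x * x for x in xs) / n

old = math.sqrt(mean_sq) - mean                        # 0.2386... (wrong)
new = math.sqrt(mean_sq - mean * mean)                 # 1.1180... (population std)
ref = math.sqrt(sum((x - mean) ** 2 for x in xs) / n)  # definition

assert abs(new - ref) < 1e-12
```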

View File

@ -18,35 +18,38 @@ from __future__ import print_function
from sklearn import datasets
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import tensorflow as tf
from tensorflow.contrib import learn
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
iris.target,
test_size=0.2,
random_state=42)
def main(unused_argv):
iris = datasets.load_iris()
x_train, x_test, y_train, y_test = train_test_split(
iris.data, iris.target, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
test_size=0.2,
random_state=42)
val_monitor = learn.monitors.ValidationMonitor(X_val, y_val,
early_stopping_rounds=200)
x_train, x_val, y_train, y_val = train_test_split(
x_train, y_train, test_size=0.2, random_state=42)
val_monitor = learn.monitors.ValidationMonitor(
x_val, y_val, early_stopping_rounds=200)
# classifier with early stopping on training data
classifier1 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
n_classes=3,
model_dir='/tmp/iris_model/')
classifier1.fit(X_train, y_train, steps=2000)
score1 = metrics.accuracy_score(y_test, classifier1.predict(X_test))
# classifier with early stopping on training data
classifier1 = learn.TensorFlowDNNClassifier(
hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model/')
classifier1.fit(x=x_train, y=y_train, steps=2000)
score1 = metrics.accuracy_score(y_test, classifier1.predict(x_test))
# classifier with early stopping on validation data
classifier2 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
n_classes=3,
model_dir='/tmp/iris_model_val/')
classifier2.fit(X_train, y_train, val_monitor, steps=2000)
score2 = metrics.accuracy_score(y_test, classifier2.predict(X_test))
# classifier with early stopping on validation data, save frequently for
# monitor to pick up new checkpoints.
classifier2 = learn.TensorFlowDNNClassifier(
hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model_val/',
config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
classifier2.fit(x=x_train, y=y_train, steps=2000, monitors=[val_monitor])
score2 = metrics.accuracy_score(y_test, classifier2.predict(x_test))
# In many applications, the score is improved by using early stopping
print(score2 > score1)
# In many applications, the score is improved by using early stopping
print(score2 > score1)
if __name__ == '__main__':
tf.app.run()

View File

@ -164,7 +164,7 @@ def train():
feed_dict=feed_dict(True),
options=run_options,
run_metadata=run_metadata)
train_writer.add_run_metadata(run_metadata, 'step%d' % i)
train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
train_writer.add_summary(summary, i)
print('Adding run metadata for', i)
else: # Record a summary

View File

@ -216,7 +216,7 @@ This operation returns a 1-D integer tensor representing the shape of `input`.
For example:
```prettyprint
```python
# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
shape(t) ==> [2, 2, 3]
```
@ -224,7 +224,7 @@ shape(t) ==> [2, 2, 3]
##### Args:
* <b>`input`</b>: A `Tensor`.
* <b>`input`</b>: A `Tensor` or `SparseTensor`.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:

View File

@ -796,11 +796,11 @@ A classifier for TensorFlow DNN models.
def input_fn_eval: # returns x, Y
pass
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.
@ -1140,11 +1140,11 @@ A regressor for TensorFlow DNN models.
def input_fn_eval: # returns x, Y
pass
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.
@ -2245,10 +2245,10 @@ Linear classifier model.
...
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.
@ -2579,10 +2579,10 @@ Linear regressor model.
...
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a KeyError:
if `weight_column_name` is not None:
key=weight_column_name, value=a `Tensor`
@ -4238,50 +4238,33 @@ Perform various training, evaluation, and inference actions on a graph.
### `class tf.contrib.learn.RunConfig` {#RunConfig}
This class specifies the configurations for the run.
Parameters:
execution_mode: Runners use this flag to execute different tasks, like
training vs evaluation. 'all' (the default) executes both training and
eval.
master: TensorFlow master. Empty string (the default) for local.
task: Task id of the replica running the training (default: 0).
num_ps_replicas: Number of parameter server tasks to use (default: 0).
training_worker_session_startup_stagger_secs: Seconds to sleep between the
startup of each worker task session (default: 5).
training_worker_max_startup_secs: Max seconds to wait before starting any
worker (default: 60).
eval_delay_secs: Number of seconds between the beginning of each eval run.
If one run takes more than this amount of time, the next run will start
immediately once that run completes (default 60).
eval_steps: Number of steps to run in each eval (default: 100).
num_cores: Number of cores to be used (default: 4).
verbose: Controls the verbosity, possible values:
0: the algorithm and debug information is muted.
1: trainer prints the progress.
2: log device placement is printed.
gpu_memory_fraction: Fraction of GPU memory used by the process on
each GPU uniformly on the same machine.
tf_random_seed: Random seed for TensorFlow initializers.
Setting this value allows consistency between reruns.
keep_checkpoint_max: The maximum number of recent checkpoint files to keep.
As new files are created, older files are deleted.
If None or 0, all checkpoint files are kept.
Defaults to 5 (that is, the 5 most recent checkpoint files are kept.)
keep_checkpoint_every_n_hours: Number of hours between each checkpoint
to be saved. The default value of 10,000 hours effectively disables
the feature.
Attributes:
tf_master: Tensorflow master.
tf_config: Tensorflow Session Config proto.
tf_random_seed: Tensorflow random seed.
keep_checkpoint_max: Maximum number of checkpoints to keep.
keep_checkpoint_every_n_hours: Number of hours between each checkpoint.
- - -
#### `tf.contrib.learn.RunConfig.__init__(execution_mode='all', master='', task=0, num_ps_replicas=0, training_worker_session_startup_stagger_secs=5, training_worker_max_startup_secs=60, eval_delay_secs=60, eval_steps=100, num_cores=4, verbose=1, gpu_memory_fraction=1, tf_random_seed=42, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}
#### `tf.contrib.learn.RunConfig.__init__(master='', task=0, num_ps_replicas=0, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, tf_random_seed=42, save_summary_steps=100, save_checkpoints_secs=60, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}
Constructor.
##### Args:
* <b>`master`</b>: TensorFlow master. Empty string (the default) for local.
* <b>`task`</b>: Task id of the replica running the training (default: 0).
* <b>`num_ps_replicas`</b>: Number of parameter server tasks to use (default: 0).
* <b>`num_cores`</b>: Number of cores to be used (default: 4).
* <b>`log_device_placement`</b>: Log the op placement to devices (default: False).
* <b>`gpu_memory_fraction`</b>: Fraction of GPU memory used by the process on
each GPU uniformly on the same machine.
* <b>`tf_random_seed`</b>: Random seed for TensorFlow initializers.
Setting this value allows consistency between reruns.
* <b>`save_summary_steps`</b>: Save summaries every this many steps.
* <b>`save_checkpoints_secs`</b>: Save checkpoints every this many seconds.
* <b>`keep_checkpoint_max`</b>: The maximum number of recent checkpoint files to
keep. As new files are created, older files are deleted. If None or 0,
all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
checkpoint files are kept.)
* <b>`keep_checkpoint_every_n_hours`</b>: Number of hours between each checkpoint
to be saved. The default value of 10,000 hours effectively disables
the feature.
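A short usage sketch of the updated constructor; the argument values below are illustrative:

```python
import tensorflow as tf

config = tf.contrib.learn.RunConfig(
    num_cores=8,
    gpu_memory_fraction=0.5,
    save_summary_steps=100,
    save_checkpoints_secs=60,
    keep_checkpoint_max=5)
estimator = tf.contrib.learn.TensorFlowDNNClassifier(
    hidden_units=[10, 20, 10], n_classes=3, config=config)
```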
@ -4394,7 +4377,7 @@ Run `output_dict` tensors `n` times, with the same `feed_dict` each run.
- - -
### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, max_steps=None, fail_on_nan_loss=True, monitors=None)` {#train}
### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, steps=None, fail_on_nan_loss=True, monitors=None)` {#train}
Train a model.
@ -4437,7 +4420,7 @@ program is terminated with exit code 1.
`supervisor_save_summaries_steps` seconds when training.
* <b>`feed_fn`</b>: A function that is called every iteration to produce a `feed_dict`
passed to `session.run` calls. Optional.
* <b>`max_steps`</b>: Train until `global_step_tensor` evaluates to this value.
* <b>`steps`</b>: Trains for this many steps (e.g. current global step + `steps`).
* <b>`fail_on_nan_loss`</b>: If true, raise `NanLossDuringTrainingError` if `loss_op`
evaluates to `NaN`. If false, continue training as if nothing happened.
* <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks

View File

@ -18,10 +18,10 @@ Linear regressor model.
...
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a KeyError:
if `weight_column_name` is not None:
key=weight_column_name, value=a `Tensor`

View File

@ -18,10 +18,10 @@ Linear classifier model.
...
estimator.fit(input_fn=input_fn_train)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.

View File

@ -1,4 +1,4 @@
### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, max_steps=None, fail_on_nan_loss=True, monitors=None)` {#train}
### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, steps=None, fail_on_nan_loss=True, monitors=None)` {#train}
Train a model.
@ -41,7 +41,7 @@ program is terminated with exit code 1.
`supervisor_save_summaries_steps` seconds when training.
* <b>`feed_fn`</b>: A function that is called every iteration to produce a `feed_dict`
passed to `session.run` calls. Optional.
* <b>`max_steps`</b>: Train until `global_step_tensor` evaluates to this value.
* <b>`steps`</b>: Trains for this many steps (e.g. current global step + `steps`).
* <b>`fail_on_nan_loss`</b>: If true, raise `NanLossDuringTrainingError` if `loss_op`
evaluates to `NaN`. If false, continue training as if nothing happened.
* <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks

View File

@ -6,7 +6,7 @@ This operation returns a 1-D integer tensor representing the shape of `input`.
For example:
```prettyprint
```python
# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
shape(t) ==> [2, 2, 3]
```
@ -14,7 +14,7 @@ shape(t) ==> [2, 2, 3]
##### Args:
* <b>`input`</b>: A `Tensor`.
* <b>`input`</b>: A `Tensor` or `SparseTensor`.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:

View File

@ -21,11 +21,11 @@ A classifier for TensorFlow DNN models.
def input_fn_eval: # returns x, Y
pass
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.

View File

@ -1,47 +1,30 @@
This class specifies the configurations for the run.
- - -
Parameters:
execution_mode: Runners use this flag to execute different tasks, like
training vs evaluation. 'all' (the default) executes both training and
eval.
master: TensorFlow master. Empty string (the default) for local.
task: Task id of the replica running the training (default: 0).
num_ps_replicas: Number of parameter server tasks to use (default: 0).
training_worker_session_startup_stagger_secs: Seconds to sleep between the
startup of each worker task session (default: 5).
training_worker_max_startup_secs: Max seconds to wait before starting any
worker (default: 60).
eval_delay_secs: Number of seconds between the beginning of each eval run.
If one run takes more than this amount of time, the next run will start
immediately once that run completes (default 60).
eval_steps: Number of steps to run in each eval (default: 100).
num_cores: Number of cores to be used (default: 4).
verbose: Controls the verbosity, possible values:
0: the algorithm and debug information is muted.
1: trainer prints the progress.
2: log device placement is printed.
gpu_memory_fraction: Fraction of GPU memory used by the process on
#### `tf.contrib.learn.RunConfig.__init__(master='', task=0, num_ps_replicas=0, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, tf_random_seed=42, save_summary_steps=100, save_checkpoints_secs=60, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}
Constructor.
##### Args:
* <b>`master`</b>: TensorFlow master. Empty string (the default) for local.
* <b>`task`</b>: Task id of the replica running the training (default: 0).
* <b>`num_ps_replicas`</b>: Number of parameter server tasks to use (default: 0).
* <b>`num_cores`</b>: Number of cores to be used (default: 4).
* <b>`log_device_placement`</b>: Log the op placement to devices (default: False).
* <b>`gpu_memory_fraction`</b>: Fraction of GPU memory used by the process on
each GPU uniformly on the same machine.
tf_random_seed: Random seed for TensorFlow initializers.
* <b>`tf_random_seed`</b>: Random seed for TensorFlow initializers.
Setting this value allows consistency between reruns.
keep_checkpoint_max: The maximum number of recent checkpoint files to keep.
As new files are created, older files are deleted.
If None or 0, all checkpoint files are kept.
Defaults to 5 (that is, the 5 most recent checkpoint files are kept.)
keep_checkpoint_every_n_hours: Number of hours between each checkpoint
* <b>`save_summary_steps`</b>: Save summaries every this many steps.
* <b>`save_checkpoints_secs`</b>: Save checkpoints every this many seconds.
* <b>`keep_checkpoint_max`</b>: The maximum number of recent checkpoint files to
keep. As new files are created, older files are deleted. If None or 0,
all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
checkpoint files are kept.)
* <b>`keep_checkpoint_every_n_hours`</b>: Number of hours between each checkpoint
to be saved. The default value of 10,000 hours effectively disables
the feature.
Attributes:
tf_master: Tensorflow master.
tf_config: Tensorflow Session Config proto.
tf_random_seed: Tensorflow random seed.
keep_checkpoint_max: Maximum number of checkpoints to keep.
keep_checkpoint_every_n_hours: Number of hours between each checkpoint.
- - -
#### `tf.contrib.learn.RunConfig.__init__(execution_mode='all', master='', task=0, num_ps_replicas=0, training_worker_session_startup_stagger_secs=5, training_worker_max_startup_secs=60, eval_delay_secs=60, eval_steps=100, num_cores=4, verbose=1, gpu_memory_fraction=1, tf_random_seed=42, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}

View File

@ -18,21 +18,28 @@ a probability distribution for each entry, see
on `logits` internally for efficiency. Do not call this op with the
output of `softmax`, as it will produce incorrect results.
`logits` must have the shape `[batch_size, num_classes]`
and dtype `float32` or `float64`.
`labels` must have the shape `[batch_size]` and dtype `int32` or `int64`.
A common use case is to have logits of shape `[batch_size, num_classes]` and
labels of shape `[batch_size]`, but higher dimensions are supported.
##### Args:
* <b>`logits`</b>: Unscaled log probabilities.
* <b>`labels`</b>: Each entry `labels[i]` must be an index in `[0, num_classes)`. Other
values will result in a loss of 0, but incorrect gradient computations.
* <b>`logits`</b>: Unscaled log probabilities of rank `r` and shape
`[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
* <b>`labels`</b>: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
`int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
Other values will result in a loss of 0, but incorrect gradient
computations.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:
A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
softmax cross entropy loss.
A `Tensor` of the same shape as `labels` and of the same type as `logits`
with the softmax cross entropy loss.
##### Raises:
* <b>`ValueError`</b>: If logits are scalars (need to have rank >= 1) or if the rank
of the labels is not equal to the rank of the logits minus one.
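A minimal sketch of the newly supported higher-rank case; shapes are illustrative:

```python
import tensorflow as tf

# Rank-3 logits [batch, time, num_classes] with rank-2 labels [batch, time].
logits = tf.random_normal([4, 7, 10])
labels = tf.zeros([4, 7], dtype=tf.int64)
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels)
# loss has the same shape as labels: [4, 7].
```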

View File

@ -21,11 +21,11 @@ A regressor for TensorFlow DNN models.
def input_fn_eval: # returns x, Y
pass
estimator.evaluate(input_fn_eval)
estimator.predict(x)
estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
```
Input of `fit`, `train`, and `evaluate` should have following features,
Input of `fit` and `evaluate` should have the following features,
otherwise there will be a `KeyError`:
if `weight_column_name` is not `None`, a feature with
`key=weight_column_name` whose value is a `Tensor`.

View File

@ -9,7 +9,7 @@ The indices in `argmax` are flattened, so that a maximum value at position
##### Args:
* <b>`input`</b>: A `Tensor` of type `float32`.
* <b>`input`</b>: A `Tensor`. Must be one of the following types: `float32`, `half`.
4-D with shape `[batch, height, width, channels]`. Input to pool over.
* <b>`ksize`</b>: A list of `ints` that has length `>= 4`.
The size of the window for each dimension of the input tensor.
@ -25,6 +25,6 @@ The indices in `argmax` are flattened, so that a maximum value at position
A tuple of `Tensor` objects (output, argmax).
* <b>`output`</b>: A `Tensor` of type `float32`. The max pooled output tensor.
* <b>`output`</b>: A `Tensor`. Has the same type as `input`. The max pooled output tensor.
* <b>`argmax`</b>: A `Tensor` of type `Targmax`. 4-D. The flattened indices of the max values chosen for each output.
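A short sketch of calling the op (per the tests elsewhere in this change, `MaxPoolWithArgmax` is implemented only on GPU; shapes are illustrative):

```python
import tensorflow as tf

# A flattened argmax index for a value at position [b, y, x, c] is
# ((b * height + y) * width + x) * channels + c.
inp = tf.random_normal([1, 4, 4, 1])
output, argmax = tf.nn.max_pool_with_argmax(
    inp, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
```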

View File

@ -3,8 +3,8 @@
Masks elements of `IndexedSlices`.
Given an `IndexedSlices` instance `a`, returns another `IndexedSlices` that
contains a subset of the slices of `a`. Only the slices at indices specified
in `mask_indices` are returned.
contains a subset of the slices of `a`. Only the slices at indices not
specified in `mask_indices` are returned.
This is useful when you need to extract a subset of slices in an
`IndexedSlices` object.
@ -18,7 +18,7 @@ a.indices => [12, 26, 37, 45]
tf.shape(a.values) => [4, 10]
# `b` will be the subset of `a` slices at its second and third indices, so
# we want to mask of its first and last indices (which are at absolute
# we want to mask its first and last indices (which are at absolute
# indices 12, 45)
b = tf.sparse_mask(a, [12, 45])

View File

@ -690,7 +690,7 @@ The indices in `argmax` are flattened, so that a maximum value at position
##### Args:
* <b>`input`</b>: A `Tensor` of type `float32`.
* <b>`input`</b>: A `Tensor`. Must be one of the following types: `float32`, `half`.
4-D with shape `[batch, height, width, channels]`. Input to pool over.
* <b>`ksize`</b>: A list of `ints` that has length `>= 4`.
The size of the window for each dimension of the input tensor.
@ -706,7 +706,7 @@ The indices in `argmax` are flattened, so that a maximum value at position
A tuple of `Tensor` objects (output, argmax).
* <b>`output`</b>: A `Tensor` of type `float32`. The max pooled output tensor.
* <b>`output`</b>: A `Tensor`. Has the same type as `input`. The max pooled output tensor.
* <b>`argmax`</b>: A `Tensor` of type `Targmax`. 4-D. The flattened indices of the max values chosen for each output.
@ -1244,23 +1244,30 @@ a probability distribution for each entry, see
on `logits` internally for efficiency. Do not call this op with the
output of `softmax`, as it will produce incorrect results.
`logits` must have the shape `[batch_size, num_classes]`
and dtype `float32` or `float64`.
`labels` must have the shape `[batch_size]` and dtype `int32` or `int64`.
A common use case is to have logits of shape `[batch_size, num_classes]` and
labels of shape `[batch_size]`, but higher dimensions are supported.
##### Args:
* <b>`logits`</b>: Unscaled log probabilities.
* <b>`labels`</b>: Each entry `labels[i]` must be an index in `[0, num_classes)`. Other
values will result in a loss of 0, but incorrect gradient computations.
* <b>`logits`</b>: Unscaled log probabilities of rank `r` and shape
`[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
* <b>`labels`</b>: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
`int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
Other values will result in a loss of 0, but incorrect gradient
computations.
* <b>`name`</b>: A name for the operation (optional).
##### Returns:
A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
softmax cross entropy loss.
A `Tensor` of the same shape as `labels` and of the same type as `logits`
with the softmax cross entropy loss.
##### Raises:
* <b>`ValueError`</b>: If logits are scalars (need to have rank >= 1) or if the rank
of the labels is not equal to the rank of the logits minus one.
- - -

View File

@ -1895,8 +1895,8 @@ Requires `updates.shape = indices.shape + ref.shape[1:]`.
Masks elements of `IndexedSlices`.
Given an `IndexedSlices` instance `a`, returns another `IndexedSlices` that
contains a subset of the slices of `a`. Only the slices at indices specified
in `mask_indices` are returned.
contains a subset of the slices of `a`. Only the slices at indices not
specified in `mask_indices` are returned.
This is useful when you need to extract a subset of slices in an
`IndexedSlices` object.
@ -1910,7 +1910,7 @@ a.indices => [12, 26, 37, 45]
tf.shape(a.values) => [4, 10]
# `b` will be the subset of `a` slices at its second and third indices, so
# we want to mask of its first and last indices (which are at absolute
# we want to mask its first and last indices (which are at absolute
# indices 12, 45)
b = tf.sparse_mask(a, [12, 45])

View File

@ -10,7 +10,7 @@ There are three main methods of getting data into a TensorFlow program:
[TOC]
## Feeding
## Feeding
TensorFlow's feed mechanism lets you inject data into any Tensor in a
computation graph. A python computation can thus feed data directly into the
@ -377,11 +377,11 @@ Again, the example queue will have some elements queued, so training will
continue until those are exhausted. If the example queue is a
[`RandomShuffleQueue`](../../api_docs/python/io_ops.md#RandomShuffleQueue), say
because you are using `shuffle_batch` or `shuffle_batch_join`, it normally will
avoid ever going having fewer than its `min_after_dequeue` attr elements
buffered. However, once the queue is closed that restriction will be lifted and
the queue will eventually empty. At that point the actual training threads,
when they try and dequeue from example queue, will start getting `OutOfRange`
errors and exiting. Once all the training threads are done,
avoid ever having fewer than its `min_after_dequeue` attr elements buffered.
However, once the queue is closed, that restriction will be lifted and the
queue will eventually empty. At that point the actual training threads, when
they try to dequeue from the example queue, will start getting `OutOfRange`
errors and exiting. Once all the training threads are done,
[`tf.train.Coordinator.join`](../../api_docs/python/train.md#Coordinator.join)
will return and you can exit cleanly.
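A hedged sketch of such a pipeline; the tensor shape and queue parameters below are illustrative:

```python
import tensorflow as tf

example = tf.random_normal([28, 28])  # stand-in for a parsed record
# With min_after_dequeue=1000, the underlying RandomShuffleQueue keeps at
# least 1000 buffered elements until its input is closed, after which it
# drains and further dequeues eventually raise OutOfRange errors.
batch = tf.train.shuffle_batch(
    [example], batch_size=32, capacity=2000, min_after_dequeue=1000)
```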

View File

@ -146,7 +146,7 @@ for step in xrange(1000000):
# When done, ask the threads to stop.
coord.request_stop()
# And wait for them to actually do it.
coord.join(threads)
coord.join(enqueue_threads)
```
## Handling Exceptions

View File

@ -178,9 +178,9 @@ https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/get_starte
[bazel](https://github.com/bazelbuild/bazel)).
Next:
```
```bash
cd tensorflow/models/rnn/ptb
python ptb_word_lm --data_path=/tmp/simple-examples/data/ --model small
python ptb_word_lm.py --data_path=/tmp/simple-examples/data/ --model small
```
There are 3 supported model configurations in the tutorial code: "small",

View File

@ -48,9 +48,19 @@ EVAL_FREQUENCY = 100 # Number of steps between evaluations.
tf.app.flags.DEFINE_boolean("self_test", False, "True if running a self test.")
tf.app.flags.DEFINE_boolean('use_fp16', False,
"Use half floats instead of full floats if True.")
FLAGS = tf.app.flags.FLAGS
def data_type():
"""Return the type of the activations, weights, and placeholder variables."""
if FLAGS.use_fp16:
return tf.float16
else:
return tf.float32
def maybe_download(filename):
"""Download the data from Yann's website, unless it's already here."""
if not tf.gfile.Exists(WORK_DIRECTORY):
@ -142,11 +152,11 @@ def main(argv=None): # pylint: disable=unused-argument
# These placeholder nodes will be fed a batch of training data at each
# training step using the {feed_dict} argument to the Run() call below.
train_data_node = tf.placeholder(
tf.float32,
data_type(),
shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,))
eval_data = tf.placeholder(
tf.float32,
data_type(),
shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
# The variables below hold all the trainable weights. They are passed an
@ -155,24 +165,24 @@ def main(argv=None): # pylint: disable=unused-argument
conv1_weights = tf.Variable(
tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32.
stddev=0.1,
seed=SEED))
conv1_biases = tf.Variable(tf.zeros([32]))
conv2_weights = tf.Variable(
tf.truncated_normal([5, 5, 32, 64],
stddev=0.1,
seed=SEED))
conv2_biases = tf.Variable(tf.constant(0.1, shape=[64]))
seed=SEED, dtype=data_type()))
conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type()))
conv2_weights = tf.Variable(tf.truncated_normal(
[5, 5, 32, 64], stddev=0.1,
seed=SEED, dtype=data_type()))
conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type()))
fc1_weights = tf.Variable( # fully connected, depth 512.
tf.truncated_normal(
[IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512],
stddev=0.1,
seed=SEED))
fc1_biases = tf.Variable(tf.constant(0.1, shape=[512]))
fc2_weights = tf.Variable(
tf.truncated_normal([512, NUM_LABELS],
tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512],
stddev=0.1,
seed=SEED))
fc2_biases = tf.Variable(tf.constant(0.1, shape=[NUM_LABELS]))
seed=SEED,
dtype=data_type()))
fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type()))
fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS],
stddev=0.1,
seed=SEED,
dtype=data_type()))
fc2_biases = tf.Variable(tf.constant(
0.1, shape=[NUM_LABELS], dtype=data_type()))
# We will replicate the model structure for the training subgraph, as well
# as the evaluation subgraphs, while sharing the trainable parameters.
@ -230,7 +240,7 @@ def main(argv=None): # pylint: disable=unused-argument
# Optimizer: set up a variable that's incremented once per batch and
# controls the learning rate decay.
batch = tf.Variable(0)
batch = tf.Variable(0, dtype=data_type())
# Decay once per epoch, using an exponential schedule starting at 0.01.
learning_rate = tf.train.exponential_decay(
0.01, # Base learning rate.

View File

@ -99,6 +99,42 @@ def GetShrunkInceptionMaxPoolShapes(shrink=30):
class PoolingTest(tf.test.TestCase):
def _VerifyOneType(self, pool_func, input_sizes, ksize, strides, padding,
data_format, data_type, expected, use_gpu):
"""Verifies the output values of the pooling function.
Args:
pool_func: Function to be called, e.g. tf.nn.max_pool or tf.nn.avg_pool.
input_sizes: Input tensor dimensions.
ksize: The kernel size dimensions.
strides: The stride dimensions.
padding: Padding type.
data_format: The data format we use to run the pooling operation.
data_type: The data type to use to run the pooling operation.
expected: An array containing the expected operation outputs.
use_gpu: Whether we are running on GPU.
"""
total_size = 1
for s in input_sizes:
total_size *= s
# Initializes the input tensor with an array containing incrementing
# numbers from 1.
x = [f * 1.0 for f in range(1, total_size + 1)]
with self.test_session(use_gpu=use_gpu) as sess:
t = tf.constant(x, shape=input_sizes, dtype=data_type)
if data_format == "NCHW":
t = NHWCToNCHW(t)
ksize = NHWCToNCHW(ksize)
strides = NHWCToNCHW(strides)
t = pool_func(t, ksize=ksize, strides=strides, padding=padding,
data_format=data_format)
if data_format == "NCHW":
t = NCHWToNHWC(t)
actual = t.eval()
self.assertAllCloseAccordingToType(expected, actual.flatten())
self.assertShapeEqual(actual, t)
def _VerifyOneTest(self, pool_func, input_sizes, ksize, strides, padding,
data_format, expected, use_gpu):
"""Verifies the output values of the pooling function.
@ -114,25 +150,12 @@ class PoolingTest(tf.test.TestCase):
expected: An array containing the expected operation outputs.
use_gpu: Whether we are running on GPU.
"""
total_size = 1
for s in input_sizes:
total_size *= s
# Initializes the input tensor with array containing incrementing
# numbers from 1.
x = [f * 1.0 for f in range(1, total_size + 1)]
with self.test_session(use_gpu=use_gpu) as sess:
t = tf.constant(x, shape=input_sizes)
if data_format == "NCHW":
t = NHWCToNCHW(t)
ksize = NHWCToNCHW(ksize)
strides = NHWCToNCHW(strides)
t = pool_func(t, ksize=ksize, strides=strides, padding=padding,
data_format=data_format)
if data_format == "NCHW":
t = NCHWToNHWC(t)
actual = t.eval()
self.assertAllClose(expected, actual.flatten())
self.assertShapeEqual(actual, t)
self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
data_format, tf.float32, expected, use_gpu)
if not use_gpu or test_util.CudaSupportsHalfMatMulAndConv():
self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
data_format, tf.float16, expected, use_gpu)
def _VerifyValues(self, pool_func, input_sizes, ksize, strides, padding,
expected, use_gpu):
@ -372,32 +395,40 @@ class PoolingTest(tf.test.TestCase):
def testKernelSmallerThanStrideValid(self):
for use_gpu in [True, False]:
self._VerifyValues(tf.nn.max_pool, input_sizes=[1, 7, 7, 1],
ksize=[1, 2, 2, 1], strides=[1, 3, 3, 1],
padding="VALID",
expected=[9, 12, 30, 33],
use_gpu=use_gpu)
self._VerifyValues(tf.nn.max_pool,
input_sizes=[1, 7, 7, 1],
ksize=[1, 2, 2, 1],
strides=[1, 3, 3, 1],
padding="VALID",
expected=[9, 12, 30, 33],
use_gpu=use_gpu)
self._VerifyValues(tf.nn.avg_pool, input_sizes=[1, 7, 7, 1],
ksize=[1, 2, 2, 1], strides=[1, 3, 3, 1],
padding="VALID",
expected=[5, 8, 26, 29],
use_gpu=use_gpu)
self._VerifyValues(tf.nn.avg_pool,
input_sizes=[1, 7, 7, 1],
ksize=[1, 2, 2, 1],
strides=[1, 3, 3, 1],
padding="VALID",
expected=[5, 8, 26, 29],
use_gpu=use_gpu)
def testKernelSmallerThanStrideSame(self):
for use_gpu in [True, False]:
for pool_func in [tf.nn.max_pool, tf.nn.avg_pool]:
self._VerifyValues(pool_func, input_sizes=[1, 3, 3, 1],
ksize=[1, 1, 1, 1], strides=[1, 2, 2, 1],
padding="SAME",
expected=[1, 3, 7, 9],
use_gpu=use_gpu)
for pool_func in [tf.nn.max_pool, tf.nn.avg_pool]:
self._VerifyValues(pool_func,
input_sizes=[1, 3, 3, 1],
ksize=[1, 1, 1, 1],
strides=[1, 2, 2, 1],
padding="SAME",
expected=[1, 3, 7, 9],
use_gpu=use_gpu)
self._VerifyValues(pool_func, input_sizes=[1, 4, 4, 1],
ksize=[1, 1, 1, 1], strides=[1, 2, 2, 1],
padding="SAME",
expected=[1, 3, 9, 11],
use_gpu=use_gpu)
self._VerifyValues(pool_func,
input_sizes=[1, 4, 4, 1],
ksize=[1, 1, 1, 1],
strides=[1, 2, 2, 1],
padding="SAME",
expected=[1, 3, 9, 11],
use_gpu=use_gpu)
def _testDepthwiseMaxPoolInvalidConfig(self, in_size, ksize, strides,
error_msg, use_gpu=False):
@ -425,43 +456,49 @@ class PoolingTest(tf.test.TestCase):
# The following are tests that verify that the CPU and GPU implementations
# produce the same results.
def _CompareMaxPoolingFwd(self, input_shape, ksize, strides, padding):
tensor_input = np.random.rand(*input_shape).astype(np.float32)
with self.test_session(use_gpu=True):
t = tf.constant(tensor_input, shape=input_shape)
out_op, _ = tf.nn.max_pool_with_argmax(t, ksize, strides, padding)
gpu_val = out_op.eval()
with self.test_session(use_gpu=False):
t = tf.constant(tensor_input, shape=input_shape)
out_op = tf.nn.max_pool(t, ksize, strides, padding)
cpu_val = out_op.eval()
self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5)
for dtype in np.float32, np.float16:
tensor_input = np.random.rand(*input_shape).astype(dtype)
with self.test_session(use_gpu=True):
t = tf.constant(tensor_input, shape=input_shape)
out_op, _ = tf.nn.max_pool_with_argmax(t, ksize, strides, padding)
gpu_val = out_op.eval()
with self.test_session(use_gpu=False):
t = tf.constant(tensor_input, shape=input_shape)
out_op = tf.nn.max_pool(t, ksize, strides, padding)
cpu_val = out_op.eval()
self.assertAllCloseAccordingToType(cpu_val, gpu_val)
def _CompareMaxPoolingBk(self, input_shape, output_shape, ksize, strides,
padding):
# Generate numbers in a narrow range, so that there are many duplicates
# in the input.
tensor_input = np.random.random_integers(0, 3,
input_shape).astype(np.float32)
tensor_output = np.random.rand(*output_shape).astype(np.float32)
with self.test_session(use_gpu=True):
t = tf.constant(tensor_input, shape=input_shape)
_, argmax_op = tf.nn.max_pool_with_argmax(t, ksize, strides, padding)
argmax = argmax_op.eval()
grad_in = tf.constant(tensor_output, shape=output_shape)
out_op = gen_nn_ops._max_pool_grad_with_argmax(t, grad_in, argmax,
ksize, strides, padding)
gpu_val = out_op.eval()
self.assertShapeEqual(gpu_val, out_op)
with self.test_session(use_gpu=False):
t = tf.constant(tensor_input, shape=input_shape)
out_op = tf.nn.max_pool(t, ksize, strides, padding)
orig_out = out_op.eval()
grad_in = tf.constant(tensor_output, shape=output_shape)
out_op = gen_nn_ops._max_pool_grad(t, orig_out, grad_in, ksize,
strides, padding)
cpu_val = out_op.eval()
self.assertShapeEqual(cpu_val, out_op)
self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5)
for dtype in np.float32, np.float16:
# Generate numbers in a narrow range, so that there are many duplicates
# in the input.
tensor_input = np.random.random_integers(0, 3, input_shape).astype(dtype)
tensor_output = np.random.rand(*output_shape).astype(dtype)
with self.test_session(use_gpu=True):
t = tf.constant(tensor_input, shape=input_shape)
_, argmax_op = tf.nn.max_pool_with_argmax(t, ksize, strides, padding)
argmax = argmax_op.eval()
grad_in = tf.constant(tensor_output, shape=output_shape)
out_op = gen_nn_ops._max_pool_grad_with_argmax(t, grad_in, argmax,
ksize, strides, padding)
gpu_val = out_op.eval()
self.assertShapeEqual(gpu_val, out_op)
with self.test_session(use_gpu=False):
t = tf.constant(tensor_input, shape=input_shape)
out_op = tf.nn.max_pool(t, ksize, strides, padding)
orig_out = out_op.eval()
grad_in = tf.constant(tensor_output, shape=output_shape)
out_op = gen_nn_ops._max_pool_grad(t, orig_out, grad_in, ksize, strides,
padding)
cpu_val = out_op.eval()
self.assertShapeEqual(cpu_val, out_op)
if dtype == np.float16:
# The CPU version accumulates its gradient in fp16, so it is less
# accurate than the GPU version, which does the accumulation in fp32.
self.assertAllClose(cpu_val, gpu_val, rtol=0.01, atol=0.01)
else:
self.assertAllClose(cpu_val, gpu_val)
def testMaxPoolingWithArgmax(self):
# MaxPoolWithArgMax is implemented only on GPU.

View File

@ -30,6 +30,9 @@ from tensorflow.python.ops import sparse_ops
class SparseXentTest(tf.test.TestCase):
def _npXent(self, features, labels):
is_higher_dim = len(features.shape) > 2
features = np.reshape(features, [-1, features.shape[-1]])
labels = np.reshape(labels, [-1])
batch_dim = 0
class_dim = 1
batch_size = features.shape[batch_dim]
@ -40,14 +43,15 @@ class SparseXentTest(tf.test.TestCase):
labels_mat[np.arange(batch_size), labels] = 1.0
bp = (probs - labels_mat)
l = -np.sum(labels_mat * np.log(probs + 1.0e-20), axis=1)
return l, bp
return l, bp, is_higher_dim
def _testXent(self, np_features, np_labels, use_gpu=False):
np_loss, np_backprop = self._npXent(np_features, np_labels)
np_loss, np_backprop, is_higher_dim = self._npXent(np_features, np_labels)
with self.test_session(use_gpu=use_gpu) as sess:
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
np_features, np_labels)
backprop = loss.op.outputs[1]
backprop = (loss.op.inputs[0].op.outputs[1] if is_higher_dim
else loss.op.outputs[1])
tf_loss, tf_backprop = sess.run([loss, backprop])
self.assertAllCloseAccordingToType(np_loss, tf_loss)
self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
@ -71,14 +75,6 @@ class SparseXentTest(tf.test.TestCase):
self._testSingleClass(use_gpu=True)
self._testSingleClass(use_gpu=False)
def testRankTooLarge(self):
np_features = np.array(
[[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]).astype(np.float32)
np_labels = np.array([1, 2])
self.assertRaisesRegexp(
ValueError, "must have rank 2",
tf.nn.sparse_softmax_cross_entropy_with_logits, np_features, np_labels)
def testNpXent(self):
# We create 2 batches of logits for testing.
# batch 0 is the boring uniform distribution: 1, 1, 1, 1, with target 3.
@ -104,7 +100,7 @@ class SparseXentTest(tf.test.TestCase):
# With a hard 1, the backprop is [0.032 - 1.0 = -0.968, 0.087, 0.237, 0.644]
# The loss for this batch is [1.0 * -log(0.25), 1.0 * -log(0.032)]
# = [1.3862, 3.4420]
np_loss, np_backprop = self._npXent(np.array(features), np.array(labels))
np_loss, np_backprop, _ = self._npXent(np.array(features), np.array(labels))
self.assertAllClose(np.array([[0.25, 0.25, 0.25, -0.75],
[-0.968, 0.087, 0.237, 0.6439]]),
np_backprop,
@ -114,15 +110,21 @@ class SparseXentTest(tf.test.TestCase):
def testShapeMismatch(self):
with self.test_session():
with self.assertRaises(ValueError):
with self.assertRaisesRegexp(ValueError, ".*Rank mismatch:*"):
tf.nn.sparse_softmax_cross_entropy_with_logits(
[[0., 1.], [2., 3.]], [[0, 2]])
[[0., 1.], [2., 3.], [2., 3.]], [[0, 2]])
def testNotMatrix(self):
def testScalar(self):
with self.test_session():
with self.assertRaises(ValueError):
with self.assertRaisesRegexp(ValueError, ".*Logits cannot be scalars*"):
tf.nn.sparse_softmax_cross_entropy_with_logits(
[0., 1., 2., 3.], [0, 2])
tf.constant(1.0), tf.constant(0))
def testVector(self):
with self.test_session():
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
tf.constant([1.0]), tf.constant(0))
self.assertAllClose(0.0, loss.eval())
def testFloat(self):
for label_dtype in np.int32, np.int64:
@ -155,6 +157,31 @@ class SparseXentTest(tf.test.TestCase):
print("cross entropy gradient err = ", err)
self.assertLess(err, 5e-8)
def _testHighDim(self, use_gpu, features, labels):
np_loss, np_backprop, _ = self._npXent(np.array(features), np.array(labels))
# manually reshape loss
np_loss = np.reshape(np_loss, np.array(labels).shape)
with self.test_session(use_gpu=use_gpu) as sess:
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
features, labels)
backprop = loss.op.inputs[0].op.outputs[1]
tf_loss, tf_backprop = sess.run([loss, backprop])
self.assertAllCloseAccordingToType(np_loss, tf_loss)
self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
def testHighDim(self):
features = [[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]
labels = [[3], [0]]
self._testHighDim(True, features, labels)
self._testHighDim(False, features, labels)
def testHighDim2(self):
features = [[[1., 1., 1., 1.], [2., 2., 2., 2.]],
[[1., 2., 3., 4.], [5., 6., 7., 8.]]]
labels = [[3, 2], [0, 3]]
self._testHighDim(True, features, labels)
self._testHighDim(False, features, labels)
def _sparse_vs_dense_xent_benchmark_dense(labels, logits):
labels = tf.identity(labels)

View File

@ -126,6 +126,7 @@ def shape(input, name=None):
else:
return gen_array_ops.shape(input, name=name)
def rank(input, name=None):
"""Returns the rank of a tensor.
@ -612,8 +613,8 @@ def sparse_mask(a, mask_indices, name=None):
"""Masks elements of `IndexedSlices`.
Given an `IndexedSlices` instance `a`, returns another `IndexedSlices` that
contains a subset of the slices of `a`. Only the slices at indices specified
in `mask_indices` are returned.
contains a subset of the slices of `a`. Only the slices at indices not
specified in `mask_indices` are returned.
This is useful when you need to extract a subset of slices in an
`IndexedSlices` object.
@ -627,7 +628,7 @@ def sparse_mask(a, mask_indices, name=None):
tf.shape(a.values) => [4, 10]
# `b` will be the subset of `a` slices at its second and third indices, so
# we want to mask of its first and last indices (which are at absolute
# we want to mask its first and last indices (which are at absolute
# indices 12, 45)
b = tf.sparse_mask(a, [12, 45])

View File

@ -440,30 +440,65 @@ def sparse_softmax_cross_entropy_with_logits(logits, labels, name=None):
on `logits` internally for efficiency. Do not call this op with the
output of `softmax`, as it will produce incorrect results.
`logits` must have the shape `[batch_size, num_classes]`
and dtype `float32` or `float64`.
`labels` must have the shape `[batch_size]` and dtype `int32` or `int64`.
A common use case is to have logits of shape `[batch_size, num_classes]` and
labels of shape `[batch_size]`, but higher dimensions are supported.
Args:
logits: Unscaled log probabilities.
labels: Each entry `labels[i]` must be an index in `[0, num_classes)`. Other
values will result in a loss of 0, but incorrect gradient computations.
logits: Unscaled log probabilities of rank `r` and shape
`[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
`int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
Other values will result in a loss of 0, but incorrect gradient
computations.
name: A name for the operation (optional).
Returns:
A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
softmax cross entropy loss.
A `Tensor` of the same shape as `labels` and of the same type as `logits`
with the softmax cross entropy loss.
Raises:
ValueError: If logits are scalars (need to have rank >= 1) or if the rank
of the labels is not equal to the rank of the logits minus one.
"""
# TODO(pcmurray) Raise an error when the label is not an index in
# [0, num_classes). Note: This could break users who call this with bad
# labels, but disregard the bad results.
# The second output tensor contains the gradients. We use it in
# _CrossEntropyGrad() in nn_grad but not here.
cost, unused_backprop = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
logits, labels, name=name)
return cost
# Reshape logits and labels to rank 2.
with ops.op_scope([labels, logits], name,
"SparseSoftmaxCrossEntropyWithLogits"):
labels = ops.convert_to_tensor(labels)
logits = ops.convert_to_tensor(logits)
# Store label shape for result later.
labels_static_shape = labels.get_shape()
labels_shape = array_ops.shape(labels)
if logits.get_shape().ndims is not None and logits.get_shape().ndims == 0:
raise ValueError("Logits cannot be scalars - received shape %s." %
logits.get_shape())
if logits.get_shape().ndims is not None and (
labels_static_shape.ndims is not None and
labels_static_shape.ndims != logits.get_shape().ndims - 1):
raise ValueError("Rank mismatch: Labels rank (received %s) should equal "
"logits rank (received %s) - 1." %
(labels_static_shape.ndims, logits.get_shape().ndims))
# Check if no reshapes are required.
if logits.get_shape().ndims == 2:
cost, _ = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
logits, labels, name=name)
return cost
# Reshape logits to 2 dim, labels to 1 dim.
num_classes = array_ops.gather(array_ops.shape(logits),
array_ops.rank(logits) - 1)
logits = array_ops.reshape(logits, [-1, num_classes])
labels = array_ops.reshape(labels, [-1])
# The second output tensor contains the gradients. We use it in
# _CrossEntropyGrad() in nn_grad but not here.
cost, _ = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
logits, labels, name=name)
cost = array_ops.reshape(cost, labels_shape)
cost.set_shape(labels_static_shape)
return cost
@ops.RegisterShape("SparseSoftmaxCrossEntropyWithLogits")

View File

@ -0,0 +1,101 @@
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ProximalAdagrad for TensorFlow."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.framework import ops
from tensorflow.python.ops import constant_op
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_ops
class ProximalAdagradOptimizer(optimizer.Optimizer):
# pylint: disable=line-too-long
"""Optimizer that implements the Proximal Adagrad algorithm.
See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf).
@@__init__
"""
def __init__(self, learning_rate, initial_accumulator_value=0.1,
l1_regularization_strength=0.0, l2_regularization_strength=0.0,
use_locking=False, name="ProximalAdagrad"):
"""Construct a new ProximalAdagrad optimizer.
Args:
learning_rate: A `Tensor` or a floating point value. The learning rate.
initial_accumulator_value: A floating point value.
Starting value for the accumulators, must be positive.
l1_regularization_strength: A float value, must be greater than or
equal to zero.
l2_regularization_strength: A float value, must be greater than or
equal to zero.
use_locking: If `True` use locks for update operations.
name: Optional name prefix for the operations created when applying
gradients. Defaults to "ProximalAdagrad".
Raises:
ValueError: If the `initial_accumulator_value` is invalid.
"""
if initial_accumulator_value <= 0.0:
raise ValueError("initial_accumulator_value must be positive: %s" %
initial_accumulator_value)
super(ProximalAdagradOptimizer, self).__init__(use_locking, name)
self._learning_rate = learning_rate
self._initial_accumulator_value = initial_accumulator_value
self._l1_regularization_strength = l1_regularization_strength
self._l2_regularization_strength = l2_regularization_strength
# Created in _prepare().
self._l1_regularization_strength_tensor = None
self._l2_regularization_strength_tensor = None
self._learning_rate_tensor = None
def _create_slots(self, var_list):
for v in var_list:
with ops.colocate_with(v):
val = constant_op.constant(self._initial_accumulator_value,
shape=v.get_shape())
self._get_or_make_slot(v, val, "accumulator", self._name)
def _prepare(self):
self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
name="learning_rate")
self._l1_regularization_strength_tensor = ops.convert_to_tensor(
self._l1_regularization_strength,
name="l1_regularization_strength")
self._l2_regularization_strength_tensor = ops.convert_to_tensor(
self._l2_regularization_strength,
name="l2_regularization_strength")
def _apply_dense(self, grad, var):
acc = self.get_slot(var, "accumulator")
return training_ops.apply_proximal_adagrad(
var, acc, self._learning_rate_tensor,
self._l1_regularization_strength_tensor,
self._l2_regularization_strength_tensor,
grad, use_locking=self._use_locking)
def _apply_sparse(self, grad, var):
acc = self.get_slot(var, "accumulator")
return training_ops.sparse_apply_proximal_adagrad(
var, acc, self._learning_rate_tensor,
self._l1_regularization_strength_tensor,
self._l2_regularization_strength_tensor,
grad.values, grad.indices,
use_locking=self._use_locking)
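A condensed usage sketch, mirroring the functional tests that follow:

```python
import tensorflow as tf

var = tf.Variable([1.0, 2.0])
grad = tf.constant([0.1, 0.2])
opt = tf.train.ProximalAdagradOptimizer(
    3.0, initial_accumulator_value=0.1,
    l1_regularization_strength=0.001, l2_regularization_strength=2.0)
update = opt.apply_gradients([(grad, var)])
```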

View File

@ -0,0 +1,205 @@
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functional tests for Proximal Adagrad operations."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
class ProximalAdagradOptimizerTest(tf.test.TestCase):
def testProximalAdagradwithoutRegularization(self):
with self.test_session() as sess:
var0 = tf.Variable([0.0, 0.0])
var1 = tf.Variable([0.0, 0.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalAdagradOptimizer(3.0,
initial_accumulator_value=0.1,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([0.0, 0.0], v0_val)
self.assertAllClose([0.0, 0.0], v1_val)
# Run 3 steps of Proximal Adagrad.
for _ in range(3):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([-2.60260963, -4.29698515]),
v0_val)
self.assertAllClose(np.array([-0.28432083, -0.56694895]),
v1_val)
def testProximalAdagradwithoutRegularization2(self):
with self.test_session() as sess:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([4.0, 3.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalAdagradOptimizer(3.0,
initial_accumulator_value=0.1,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([4.0, 3.0], v1_val)
# Run 3 steps of Proximal Adagrad.
for _ in range(3):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([-1.60261, -2.296985]),
v0_val)
self.assertAllClose(np.array([3.715679, 2.433051]),
v1_val)
def testProximalAdagradWithL1(self):
with self.test_session() as sess:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([4.0, 3.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalAdagradOptimizer(3.0,
initial_accumulator_value=0.1,
l1_regularization_strength=0.001,
l2_regularization_strength=0.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([4.0, 3.0], v1_val)
# Run 10 steps of Proximal Adagrad.
for _ in range(10):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([0.662907, 0.767398]),
v0_val)
self.assertAllClose(np.array([2.959304, 1.029232]),
v1_val)
def testProximalAdagradWithL1_L2(self):
with self.test_session() as sess:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([4.0, 3.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalAdagradOptimizer(3.0,
initial_accumulator_value=0.1,
l1_regularization_strength=0.001,
l2_regularization_strength=2.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([4.0, 3.0], v1_val)
# Run 10 steps of Proximal Adagrad.
for _ in range(10):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([0.043069, 0.080461]),
v0_val)
self.assertAllClose(np.array([0.004069, 0.008578]),
v1_val)
def applyOptimizer(self, opt, steps=5, is_sparse=False):
if is_sparse:
var0 = tf.Variable([[1.0], [2.0]])
var1 = tf.Variable([[3.0], [4.0]])
grads0 = tf.IndexedSlices(tf.constant([0.1], shape=[1, 1]),
tf.constant([0]),
tf.constant([2, 1]))
grads1 = tf.IndexedSlices(tf.constant([0.02], shape=[1, 1]),
tf.constant([1]),
tf.constant([2, 1]))
else:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([3.0, 4.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
sess = tf.get_default_session()
v0_val, v1_val = sess.run([var0, var1])
if is_sparse:
self.assertAllClose([[1.0], [2.0]], v0_val)
self.assertAllClose([[3.0], [4.0]], v1_val)
else:
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([3.0, 4.0], v1_val)
# Run ProximalAdagrad for a few steps.
for _ in range(steps):
update.run()
v0_val, v1_val = sess.run([var0, var1])
return v0_val, v1_val
def testEquivAdagradwithoutRegularization(self):
with self.test_session():
val0, val1 = self.applyOptimizer(
tf.train.ProximalAdagradOptimizer(3.0,
initial_accumulator_value=0.1,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0))
with self.test_session():
val2, val3 = self.applyOptimizer(
tf.train.AdagradOptimizer(3.0, initial_accumulator_value=0.1))
self.assertAllClose(val0, val2)
self.assertAllClose(val1, val3)
def testEquivSparseAdagradwithoutRegularization(self):
with self.test_session():
val0, val1 = self.applyOptimizer(
tf.train.ProximalAdagradOptimizer(3.0,
initial_accumulator_value=0.1,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0),
is_sparse=True)
with self.test_session():
val2, val3 = self.applyOptimizer(
tf.train.AdagradOptimizer(3.0, initial_accumulator_value=0.1),
is_sparse=True)
self.assertAllClose(val0, val2)
self.assertAllClose(val1, val3)
if __name__ == "__main__":
tf.test.main()

View File

@ -0,0 +1,81 @@
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ProximalGradientDescent for TensorFlow."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.framework import ops
# pylint: disable=unused-import
from tensorflow.python.ops import math_ops
# pylint: enable=unused-import
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_ops
class ProximalGradientDescentOptimizer(optimizer.Optimizer):
# pylint: disable=line-too-long
"""Optimizer that implements the proximal gradient descent algorithm.
See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf).
@@__init__
"""
def __init__(self, learning_rate, l1_regularization_strength=0.0,
l2_regularization_strength=0.0, use_locking=False,
name="ProximalGradientDescent"):
"""Construct a new proximal gradient descent optimizer.
Args:
learning_rate: A Tensor or a floating point value. The learning
rate to use.
l1_regularization_strength: A float value, must be greater than or
equal to zero.
l2_regularization_strength: A float value, must be greater than or
equal to zero.
use_locking: If True use locks for update operations.
name: Optional name prefix for the operations created when applying
gradients. Defaults to "ProximalGradientDescent".
"""
super(ProximalGradientDescentOptimizer, self).__init__(use_locking, name)
self._learning_rate = learning_rate
self._l1_regularization_strength = l1_regularization_strength
self._l2_regularization_strength = l2_regularization_strength
self._l1_regularization_strength_tensor = None
self._l2_regularization_strength_tensor = None
def _apply_dense(self, grad, var):
return training_ops.apply_proximal_gradient_descent(
var,
self._learning_rate_tensor,
self._l1_regularization_strength_tensor,
self._l2_regularization_strength_tensor,
grad,
use_locking=self._use_locking).op
def _apply_sparse(self, grad, var):
delta = ops.IndexedSlices(grad.values * self._learning_rate_tensor,
grad.indices, grad.dense_shape)
return var.scatter_sub(delta, use_locking=self._use_locking)
def _prepare(self):
self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
name="learning_rate")
self._l1_regularization_strength_tensor = ops.convert_to_tensor(
self._l1_regularization_strength, name="l1_regularization_strength")
self._l2_regularization_strength_tensor = ops.convert_to_tensor(
self._l2_regularization_strength, name="l2_regularization_strength")
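
`_apply_dense` above defers to `training_ops.apply_proximal_gradient_descent`; per the referenced FOBOS paper, the rule is a gradient step followed by L1 soft-thresholding and L2 shrinkage. A minimal NumPy sketch for intuition (the kernel is authoritative, and `proximal_gradient_descent_step` is a name invented here); with l1 = l2 = 0 it reduces to vanilla gradient descent, which the equivalence tests in the next file depend on:

```python
import numpy as np

def proximal_gradient_descent_step(var, grad, lr, l1, l2):
  """One dense FOBOS-style step (NumPy sketch, not the C++ kernel)."""
  prox = var - lr * grad  # ordinary gradient step
  # L1 soft-thresholding, then L2 shrinkage.
  var = np.sign(prox) * np.maximum(np.abs(prox) - lr * l1, 0.0)
  return var / (1.0 + lr * l2)

# Three unregularized steps from zero with lr=3.0 and grads [0.1, 0.2]
# move each coordinate by 3 * 3.0 * grad, i.e. to [-0.9, -1.8] -- the
# values asserted in testProximalGradientDescentwithoutRegularization.
v = np.zeros(2)
for _ in range(3):
  v = proximal_gradient_descent_step(v, np.array([0.1, 0.2]), 3.0, 0.0, 0.0)
```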

View File

@@ -0,0 +1,178 @@
# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functional tests for Proximal Gradient Descent operations."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
class ProximalGradientDescentOptimizerTest(tf.test.TestCase):
def testProximalGradientDescentwithoutRegularization(self):
with self.test_session() as sess:
var0 = tf.Variable([0.0, 0.0])
var1 = tf.Variable([0.0, 0.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalGradientDescentOptimizer(
3.0,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([0.0, 0.0], v0_val)
self.assertAllClose([0.0, 0.0], v1_val)
# Run 3 steps of proximal gradient descent.
for _ in range(3):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([-0.9, -1.8]),
v0_val)
self.assertAllClose(np.array([-0.09, -0.18]),
v1_val)
def testProximalGradientDescentwithoutRegularization2(self):
with self.test_session() as sess:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([4.0, 3.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalGradientDescentOptimizer(
3.0,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([4.0, 3.0], v1_val)
# Run 3 steps of proximal gradient descent.
for _ in range(3):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([0.1, 0.2]),
v0_val)
self.assertAllClose(np.array([3.91, 2.82]),
v1_val)
def testProximalGradientDescentWithL1_L2(self):
with self.test_session() as sess:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([4.0, 3.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
opt = tf.train.ProximalGradientDescentOptimizer(
3.0,
l1_regularization_strength=0.001,
l2_regularization_strength=2.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([4.0, 3.0], v1_val)
# Run 10 steps of proximal gradient descent.
for _ in range(10):
update.run()
v0_val, v1_val = sess.run([var0, var1])
self.assertAllClose(np.array([0.037125, 0.074625]),
v0_val)
self.assertAllClose(np.array([0.003375, 0.007125]),
v1_val)
def applyOptimizer(self, opt, steps=5, is_sparse=False):
if is_sparse:
var0 = tf.Variable([[1.0], [2.0]])
var1 = tf.Variable([[3.0], [4.0]])
grads0 = tf.IndexedSlices(tf.constant([0.1], shape=[1, 1]),
tf.constant([0]),
tf.constant([2, 1]))
grads1 = tf.IndexedSlices(tf.constant([0.02], shape=[1, 1]),
tf.constant([1]),
tf.constant([2, 1]))
else:
var0 = tf.Variable([1.0, 2.0])
var1 = tf.Variable([3.0, 4.0])
grads0 = tf.constant([0.1, 0.2])
grads1 = tf.constant([0.01, 0.02])
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
tf.initialize_all_variables().run()
sess = tf.get_default_session()
v0_val, v1_val = sess.run([var0, var1])
if is_sparse:
self.assertAllClose([[1.0], [2.0]], v0_val)
self.assertAllClose([[3.0], [4.0]], v1_val)
else:
self.assertAllClose([1.0, 2.0], v0_val)
self.assertAllClose([3.0, 4.0], v1_val)
# Run the given optimizer for a few steps.
for _ in range(steps):
update.run()
v0_val, v1_val = sess.run([var0, var1])
return v0_val, v1_val
def testEquivSparseGradientDescentwithoutRegularization(self):
with self.test_session():
val0, val1 = self.applyOptimizer(
tf.train.ProximalGradientDescentOptimizer(
3.0,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0),
is_sparse=True)
with self.test_session():
val2, val3 = self.applyOptimizer(
tf.train.GradientDescentOptimizer(3.0), is_sparse=True)
self.assertAllClose(val0, val2)
self.assertAllClose(val1, val3)
def testEquivGradientDescentwithoutRegularization(self):
with self.test_session():
val0, val1 = self.applyOptimizer(
tf.train.ProximalGradientDescentOptimizer(
3.0,
l1_regularization_strength=0.0,
l2_regularization_strength=0.0))
with self.test_session():
val2, val3 = self.applyOptimizer(
tf.train.GradientDescentOptimizer(3.0))
self.assertAllClose(val0, val2)
self.assertAllClose(val1, val3)
if __name__ == "__main__":
tf.test.main()
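
End to end, the new optimizer drops into the usual `Optimizer.minimize` workflow. A minimal, hypothetical usage sketch (the toy variable, constants, and step count are illustrative only, and it assumes the kernels added by this change are registered):

```python
import tensorflow as tf

w = tf.Variable([0.0])
loss = tf.reduce_sum(tf.square(w - 3.0))  # toy objective, minimum at w = 3
opt = tf.train.ProximalGradientDescentOptimizer(
    0.1, l1_regularization_strength=0.01)
train_op = opt.minimize(loss)

with tf.Session() as sess:
  sess.run(tf.initialize_all_variables())
  for _ in range(200):
    sess.run(train_op)
  # w approaches 3.0, pulled slightly toward zero by the L1 term.
  print(sess.run(w))
```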

View File

@@ -151,6 +151,7 @@ from tensorflow.python.ops import state_ops
from tensorflow.python.training.adadelta import AdadeltaOptimizer
from tensorflow.python.training.adagrad import AdagradOptimizer
from tensorflow.python.training.proximal_adagrad import ProximalAdagradOptimizer
from tensorflow.python.training.adam import AdamOptimizer
from tensorflow.python.training.ftrl import FtrlOptimizer
from tensorflow.python.training.momentum import MomentumOptimizer
@@ -158,6 +159,7 @@ from tensorflow.python.training.moving_averages import ExponentialMovingAverage
from tensorflow.python.training.optimizer import Optimizer
from tensorflow.python.training.rmsprop import RMSPropOptimizer
from tensorflow.python.training.gradient_descent import GradientDescentOptimizer
from tensorflow.python.training.proximal_gradient_descent import ProximalGradientDescentOptimizer
from tensorflow.python.training.sync_replicas_optimizer import SyncReplicasOptimizer
# Utility classes for training.
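
With these two import lines in place, both proximal optimizers are reachable from the public `tf.train` namespace:

```python
import tensorflow as tf

adagrad = tf.train.ProximalAdagradOptimizer(0.1)
sgd = tf.train.ProximalGradientDescentOptimizer(0.1)
```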

View File

@@ -69,6 +69,17 @@ def _ApplyAdagradShape(op):
grad_shape = op.inputs[3].get_shape().merge_with(accum_shape)
return [grad_shape]
@ops.RegisterShape("ApplyProximalAdagrad")
def _ApplyProximalAdagradShape(op):
"""Shape function for the ApplyProximalAdagrad op."""
var_shape = op.inputs[0].get_shape()
accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
_AssertInputIsScalar(op, 2) # lr
_AssertInputIsScalar(op, 3) # l1
_AssertInputIsScalar(op, 4) # l2
grad_shape = op.inputs[5].get_shape().merge_with(accum_shape)
return [grad_shape]
@ops.RegisterShape("ApplyFtrl")
def _ApplyFtrlShape(op):
@@ -133,6 +144,32 @@ def _ApplyGradientDescentShape(op):
delta_shape = op.inputs[2].get_shape().merge_with(var_shape)
return [delta_shape]
@ops.RegisterShape("ApplyProximalGradientDescent")
def _ApplyProximalGradientDescentShape(op):
"""Shape function for the ApplyProximalGradientDescent op."""
var_shape = op.inputs[0].get_shape()
_AssertInputIsScalar(op, 1) # alpha
_AssertInputIsScalar(op, 2) # l1
_AssertInputIsScalar(op, 3) # l2
delta_shape = op.inputs[4].get_shape().merge_with(var_shape)
return [delta_shape]
@ops.RegisterShape("SparseApplyProximalGradientDescent")
def _SparseApplyProximalGradientDescentShape(op):
"""Shape function for the SparseApplyGradientDescent op."""
var_shape = op.inputs[0].get_shape()
_AssertInputIsScalar(op, 1) # lr
_AssertInputIsScalar(op, 2) # l1
_AssertInputIsScalar(op, 3) # l2
grad_shape = op.inputs[4].get_shape().merge_with(
tensor_shape.TensorShape([None]).concatenate(var_shape[1:]))
unused_indices_shape = op.inputs[5].get_shape().merge_with(
tensor_shape.vector(grad_shape[0]))
return [var_shape]
@ops.RegisterShape("SparseApplyAdadelta")
def _SparseApplyAdadeltaShape(op):
"""Shape function for the SparseApplyAdadelta op."""
@@ -148,6 +185,7 @@ def _SparseApplyAdadeltaShape(op):
tensor_shape.vector(grad_shape[0]))
return [accum_update_shape]
@ops.RegisterShape("SparseApplyAdagrad")
def _SparseApplyAdagradShape(op):
"""Shape function for the SparseApplyAdagrad op."""
@@ -161,6 +199,21 @@ def _SparseApplyAdagradShape(op):
return [accum_shape]
@ops.RegisterShape("SparseApplyProximalAdagrad")
def _SparseApplyProximalAdagradShape(op):
"""Shape function for the SparseApplyProximalAdagrad op."""
var_shape = op.inputs[0].get_shape()
accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
_AssertInputIsScalar(op, 2) # lr
_AssertInputIsScalar(op, 3) # l1
_AssertInputIsScalar(op, 4) # l2
grad_shape = op.inputs[5].get_shape().merge_with(
tensor_shape.TensorShape([None]).concatenate(accum_shape[1:]))
unused_indices_shape = op.inputs[6].get_shape().merge_with(
tensor_shape.vector(grad_shape[0]))
return [accum_shape]
@ops.RegisterShape("SparseApplyFtrl")
def _SparseApplyFtrlShape(op):
"""Shape function for the SparseApplyFtrl op."""

View File

@@ -1876,6 +1876,40 @@ bool CudnnSupport::DoPoolForward(
return true;
}
bool CudnnSupport::DoPoolForward(
Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<Eigen::half>& input_data,
const dnn::BatchDescriptor& output_dimensions,
DeviceMemory<Eigen::half>* output_data) {
mutex_lock lock{dnn_handle_mutex_};
auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_),
AsCUDAStreamValue(stream));
if (status != CUDNN_STATUS_SUCCESS) {
LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
return false;
}
// Alpha is the scaling factor for input.
float alpha = 1.0;
// Beta is the scaling factor for output.
float beta = 0.0;
ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_HALF};
ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
status = dynload::cudnnPoolingForward(
parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
output_data->opaque());
if (status != CUDNN_STATUS_SUCCESS) {
LOG(ERROR) << "failed to enqueue forward pooling on stream: "
<< ToString(status);
return false;
}
return true;
}
bool CudnnSupport::DoPoolBackward(
Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
@@ -1914,6 +1948,43 @@ bool CudnnSupport::DoPoolBackward(
return true;
}
bool CudnnSupport::DoPoolBackward(
Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<Eigen::half>& input_data,
const dnn::BatchDescriptor& output_dimensions,
const DeviceMemory<Eigen::half>& output_data,
const DeviceMemory<Eigen::half>& input_diff_data,
DeviceMemory<Eigen::half>* output_diff_data) {
mutex_lock lock{dnn_handle_mutex_};
auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_),
AsCUDAStreamValue(stream));
if (status != CUDNN_STATUS_SUCCESS) {
LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
return false;
}
// Alpha is the scaling factor for input.
float alpha = 1.0;
// Beta is the scaling factor for output.
float beta = 0.0;
ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_HALF};
ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
status = dynload::cudnnPoolingBackward(
parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
src_desc.handle(), output_diff_data->opaque());
if (status != CUDNN_STATUS_SUCCESS) {
LOG(ERROR) << "failed to enqueue backward pooling on stream: "
<< ToString(status);
return false;
}
return true;
}
bool CudnnSupport::DoNormalize(
Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
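
The two overloads above plumb `Eigen::half` pooling through to `cudnnPoolingForward`/`cudnnPoolingBackward` with `CUDNN_DATA_HALF` tensor descriptors (note the alpha/beta scaling factors remain `float` even for half data). Assuming the corresponding TensorFlow GPU kernels register a half dtype, the change would surface in Python roughly as below; whether `tf.nn.max_pool` accepts `tf.float16` at this point depends on registrations outside this diff, so treat this strictly as a sketch:

```python
import tensorflow as tf

images = tf.cast(tf.random_uniform([8, 24, 24, 3]), tf.float16)
pooled = tf.nn.max_pool(images, ksize=[1, 2, 2, 1],
                        strides=[1, 2, 2, 1], padding='VALID')

with tf.Session() as sess:
  print(sess.run(pooled).dtype)  # float16; runs via cuDNN on GPU
```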

View File

@@ -201,6 +201,13 @@ class CudnnSupport : public dnn::DnnSupport {
const dnn::BatchDescriptor& output_dimensions,
DeviceMemory<float>* output_data) override;
bool DoPoolForward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<Eigen::half>& input_data,
const dnn::BatchDescriptor& output_dimensions,
DeviceMemory<Eigen::half>* output_data) override;
bool DoPoolBackward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
@@ -210,6 +217,15 @@ class CudnnSupport : public dnn::DnnSupport {
const DeviceMemory<float>& input_diff_data,
DeviceMemory<float>* output_diff_data) override;
bool DoPoolBackward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<Eigen::half>& input_data,
const dnn::BatchDescriptor& output_dimensions,
const DeviceMemory<Eigen::half>& output_data,
const DeviceMemory<Eigen::half>& input_diff_data,
DeviceMemory<Eigen::half>* output_diff_data) override;
bool DoNormalize(Stream* stream,
const dnn::NormalizeDescriptor& normalize_descriptor,
const DeviceMemory<float>& input_data,

View File

@@ -1011,6 +1011,13 @@ class DnnSupport {
const dnn::BatchDescriptor& output_dimensions,
DeviceMemory<float>* output_data) = 0;
virtual bool DoPoolForward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<Eigen::half>& input_data,
const dnn::BatchDescriptor& output_dimensions,
DeviceMemory<Eigen::half>* output_data) = 0;
// Performs differentiation of the pooling operation.
virtual bool DoPoolBackward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
@@ -1021,6 +1028,15 @@ class DnnSupport {
const DeviceMemory<float>& input_diff_data,
DeviceMemory<float>* output_diff_data) = 0;
virtual bool DoPoolBackward(Stream* stream,
const dnn::PoolingDescriptor& pooling_dimensions,
const dnn::BatchDescriptor& input_dimensions,
const DeviceMemory<Eigen::half>& input_data,
const dnn::BatchDescriptor& output_dimensions,
const DeviceMemory<Eigen::half>& output_data,
const DeviceMemory<Eigen::half>& input_diff_data,
DeviceMemory<Eigen::half>* output_diff_data) = 0;
// Applies local response normalization to the values from
// input_data and writes the result to output_data. See comments on
// NormalizeDescriptor for a description of local response

View File

@@ -909,6 +909,30 @@ Stream &Stream::ThenPoolForward(
return *this;
}
Stream &Stream::ThenPoolForward(
const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<Eigen::half> &input_data,
const dnn::BatchDescriptor &output_dimensions,
DeviceMemory<Eigen::half> *output_data) {
VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions),
PARAM(input_data), PARAM(output_dimensions), PARAM(output_data));
if (ok()) {
if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
CheckError(dnn->DoPoolForward(this, pooling_dimensions, input_dimensions,
input_data, output_dimensions,
output_data));
} else {
SetError();
LOG(WARNING)
<< "attempting to perform DNN operation using StreamExecutor "
"without DNN support";
}
}
return *this;
}
Stream &Stream::ThenPoolBackward(
const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
@@ -936,6 +960,33 @@ Stream &Stream::ThenPoolBackward(
return *this;
}
Stream &Stream::ThenPoolBackward(
const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<Eigen::half> &input_data,
const dnn::BatchDescriptor &output_dimensions,
const DeviceMemory<Eigen::half> &output_data,
const DeviceMemory<Eigen::half> &input_diff_data,
DeviceMemory<Eigen::half> *output_diff_data) {
VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions),
PARAM(input_data), PARAM(output_dimensions), PARAM(output_data),
PARAM(input_diff_data), PARAM(output_diff_data));
if (ok()) {
if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
CheckError(dnn->DoPoolBackward(this, pooling_dimensions, input_dimensions,
input_data, output_dimensions, output_data,
input_diff_data, output_diff_data));
} else {
SetError();
LOG(WARNING)
<< "attempting to perform DNN operation using StreamExecutor "
"without DNN support";
}
}
return *this;
}
Stream &Stream::ThenNormalize(
const dnn::NormalizeDescriptor &normalize_descriptor,
const DeviceMemory<float> &input_data, DeviceMemory<float> *output_data) {

View File

@@ -421,6 +421,12 @@ class Stream {
const dnn::BatchDescriptor &output_dimensions,
DeviceMemory<float> *output_data);
Stream &ThenPoolForward(const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<Eigen::half> &input_data,
const dnn::BatchDescriptor &output_dimensions,
DeviceMemory<Eigen::half> *output_data);
Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<float> &input_data,
@@ -429,6 +435,14 @@ class Stream {
const DeviceMemory<float> &input_diff_data,
DeviceMemory<float> *output_diff_data);
Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
const dnn::BatchDescriptor &input_dimensions,
const DeviceMemory<Eigen::half> &input_data,
const dnn::BatchDescriptor &output_dimensions,
const DeviceMemory<Eigen::half> &output_data,
const DeviceMemory<Eigen::half> &input_diff_data,
DeviceMemory<Eigen::half> *output_diff_data);
Stream &ThenNormalize(const dnn::NormalizeDescriptor &normalize_descriptor,
const DeviceMemory<float> &input_data,
DeviceMemory<float> *output_data);

View File

@@ -6,8 +6,8 @@
def tf_workspace(path_prefix = "", tf_repo_name = ""):
native.new_http_archive(
name = "eigen_archive",
url = "https://bitbucket.org/eigen/eigen/get/d02e6a705c30.tar.gz",
sha256 = "532956172daa8aba87c750791ff89a5c38cdb07e2525afe17ecb4bef812d67cf",
url = "https://bitbucket.org/eigen/eigen/get/0c0b79ecd74c.tar.gz",
sha256 = "b4b5884b03bd4bae114d02b36e2435ad1504ed8e51431d16c876b6f6a365882b",
build_file = path_prefix + "eigen.BUILD",
)

View File

@@ -1 +1 @@
#include "eigen-eigen-d02e6a705c30/Eigen/Cholesky"
#include "eigen-eigen-0c0b79ecd74c/Eigen/Cholesky"

View File

@@ -1 +1 @@
#include "eigen-eigen-d02e6a705c30/Eigen/Core"
#include "eigen-eigen-0c0b79ecd74c/Eigen/Core"

View File

@@ -1 +1 @@
#include "eigen-eigen-d02e6a705c30/Eigen/Eigenvalues"
#include "eigen-eigen-0c0b79ecd74c/Eigen/Eigenvalues"

Some files were not shown because too many files have changed in this diff.