diff --git a/eigen.BUILD b/eigen.BUILD
index 79bafe65b62..e32f3aab492 100644
--- a/eigen.BUILD
+++ b/eigen.BUILD
@@ -1,6 +1,6 @@
 package(default_visibility = ["//visibility:public"])
 
-archive_dir = "eigen-eigen-d02e6a705c30"
+archive_dir = "eigen-eigen-0c0b79ecd74c"
 
 cc_library(
     name = "eigen",
diff --git a/tensorflow/contrib/cmake/external/eigen.cmake b/tensorflow/contrib/cmake/external/eigen.cmake
index db409760faa..d3075ab9d23 100644
--- a/tensorflow/contrib/cmake/external/eigen.cmake
+++ b/tensorflow/contrib/cmake/external/eigen.cmake
@@ -7,7 +7,7 @@
 
 include (ExternalProject)
 
-set(eigen_archive_hash "d02e6a705c30")
+set(eigen_archive_hash "0c0b79ecd74c")
 
 set(eigen_INCLUDE_DIRS
     ${CMAKE_CURRENT_BINARY_DIR}
@@ -16,7 +16,7 @@ set(eigen_INCLUDE_DIRS
     ${tensorflow_source_dir}/third_party/eigen3
 )
 set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
-set(eigen_HASH SHA256=532956172daa8aba87c750791ff89a5c38cdb07e2525afe17ecb4bef812d67cf)
+set(eigen_HASH SHA256=b4b5884b03bd4bae114d02b36e2435ad1504ed8e51431d16c876b6f6a365882b)
 set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
 set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)
 
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index e5e2e88dbbf..59d3ea145d8 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -198,6 +198,30 @@ py_test(
     ],
 )
 
+py_test(
+    name = "experiment_test",
+    size = "small",
+    srcs = ["python/learn/tests/experiment_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":learn",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+py_test(
+    name = "learn_runner_test",
+    size = "small",
+    srcs = ["python/learn/tests/learn_runner_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":learn",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
 py_test(
     name = "tensor_signature_test",
     srcs = ["python/learn/estimators/tensor_signature_test.py"],
@@ -439,6 +463,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "stability_test",
+    size = "small",
+    srcs = ["python/learn/tests/stability_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":learn",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
 py_binary(
     name = "inspect_checkpoint",
     srcs = [
diff --git a/tensorflow/contrib/learn/python/learn/__init__.py b/tensorflow/contrib/learn/python/learn/__init__.py
index f94553eec50..375d90960d7 100644
--- a/tensorflow/contrib/learn/python/learn/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/__init__.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import numpy as np
 
+# pylint: disable=wildcard-import
 from tensorflow.contrib.learn.python.learn import datasets
 from tensorflow.contrib.learn.python.learn import estimators
 from tensorflow.contrib.learn.python.learn import graph_actions
@@ -30,9 +31,9 @@ from tensorflow.contrib.learn.python.learn import monitors
 from tensorflow.contrib.learn.python.learn import ops
 from tensorflow.contrib.learn.python.learn import preprocessing
 from tensorflow.contrib.learn.python.learn import utils
-# pylint: disable=wildcard-import
 from tensorflow.contrib.learn.python.learn.dataframe import *
 from tensorflow.contrib.learn.python.learn.estimators import *
+from tensorflow.contrib.learn.python.learn.experiment import Experiment
 from tensorflow.contrib.learn.python.learn.graph_actions import evaluate
 from tensorflow.contrib.learn.python.learn.graph_actions import infer
 from tensorflow.contrib.learn.python.learn.graph_actions import NanLossDuringTrainingError
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index 9b2bbd7562a..5079d6a6296 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -50,11 +50,11 @@ class DNNClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
 
       def input_fn_eval: # returns x, Y
         pass
-      estimator.evaluate(input_fn_eval)
-      estimator.predict(x)
+      estimator.evaluate(input_fn=input_fn_eval)
+      estimator.predict(x=x)
       ```
 
-    Input of `fit`, `train`, and `evaluate` should have following features,
+    Input of `fit` and `evaluate` should have following features,
       otherwise there will be a `KeyError`:
         if `weight_column_name` is not `None`, a feature with
           `key=weight_column_name` whose value is a `Tensor`.
@@ -145,11 +145,11 @@ class DNNRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
 
       def input_fn_eval: # returns x, Y
         pass
-      estimator.evaluate(input_fn_eval)
-      estimator.predict(x)
+      estimator.evaluate(input_fn=input_fn_eval)
+      estimator.predict(x=x)
       ```
 
-    Input of `fit`, `train`, and `evaluate` should have following features,
+    Input of `fit` and `evaluate` should have following features,
       otherwise there will be a `KeyError`:
         if `weight_column_name` is not `None`, a feature with
           `key=weight_column_name` whose value is a `Tensor`.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
index 06e5e9d9df4..c7b33d527ac 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
@@ -397,12 +397,12 @@ class DNNLinearCombinedClassifier(_DNNLinearCombinedBaseEstimator):
       ...
     def input_fn_eval: # returns x, y
       ...
-    estimator.train(input_fn_train)
-    estimator.evaluate(input_fn_eval)
-    estimator.predict(x)
+    estimator.fit(input_fn=input_fn_train)
+    estimator.evaluate(input_fn=input_fn_eval)
+    estimator.predict(x=x)
     ```
 
-    Input of `fit`, `train`, and `evaluate` should have following features,
+    Input of `fit` and `evaluate` should have following features,
       otherwise there will be a `KeyError`:
         if `weight_column_name` is not `None`, a feature with
           `key=weight_column_name` whose value is a `Tensor`.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
index d405e56bb05..407ca38f662 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
@@ -42,7 +42,14 @@ def _prepare_iris_data_for_logistic_regression():
   return iris
 
 
-def _iris_input_fn():
+def _iris_input_multiclass_fn():
+  iris = tf.contrib.learn.datasets.load_iris()
+  return {
+      'feature': tf.constant(iris.data, dtype=tf.float32)
+  }, tf.constant(iris.target, shape=[150, 1], dtype=tf.int32)
+
+
+def _iris_input_logistic_fn():
   iris = _prepare_iris_data_for_logistic_regression()
   return {
       'feature': tf.constant(iris.data, dtype=tf.float32)
@@ -64,8 +71,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase):
         dnn_feature_columns=cont_features,
         dnn_hidden_units=[3, 3])
 
-    classifier.fit(input_fn=_iris_input_fn, steps=100)
-    scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100)
+    classifier.fit(input_fn=_iris_input_logistic_fn, steps=100)
+    scores = classifier.evaluate(input_fn=_iris_input_logistic_fn, steps=100)
     self.assertGreater(scores['accuracy'], 0.9)
 
   def testLogisticRegression_TensorData(self):
@@ -127,8 +134,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase):
         dnn_feature_columns=cont_features,
         dnn_hidden_units=[3, 3])
 
-    classifier.fit(input_fn=_iris_input_fn, steps=100)
-    scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100)
+    classifier.fit(input_fn=_iris_input_multiclass_fn, steps=100)
+    scores = classifier.evaluate(input_fn=_iris_input_multiclass_fn, steps=100)
     self.assertGreater(scores['accuracy'], 0.9)
 
   def testWeightColumn(self):
@@ -210,8 +217,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase):
         dnn_hidden_units=[3, 3],
         dnn_optimizer=tf.train.AdagradOptimizer(learning_rate=0.1))
 
-    classifier.fit(input_fn=_iris_input_fn, steps=100)
-    scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100)
+    classifier.fit(input_fn=_iris_input_logistic_fn, steps=100)
+    scores = classifier.evaluate(input_fn=_iris_input_logistic_fn, steps=100)
     self.assertGreater(scores['accuracy'], 0.9)
 
   def testCustomOptimizerByString(self):
@@ -230,8 +237,8 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase):
         dnn_hidden_units=[3, 3],
         dnn_optimizer='Adagrad')
 
-    classifier.fit(input_fn=_iris_input_fn, steps=100)
-    scores = classifier.evaluate(input_fn=_iris_input_fn, steps=100)
+    classifier.fit(input_fn=_iris_input_logistic_fn, steps=100)
+    scores = classifier.evaluate(input_fn=_iris_input_logistic_fn, steps=100)
     self.assertGreater(scores['accuracy'], 0.9)
 
   def testPredict(self):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index e6c2a30134b..20531ab691b 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -111,8 +111,8 @@ class BaseEstimator(sklearn.BaseEstimator):
     self._model_dir = model_dir
     if self._model_dir is None:
       self._model_dir = tempfile.mkdtemp()
-      logging.info('Using temporary folder as model directory: %s',
-                   self._model_dir)
+      logging.warning('Using temporary folder as model directory: %s',
+                      self._model_dir)
 
     # Create a run configuration
     if config is None:
@@ -135,9 +135,8 @@ class BaseEstimator(sklearn.BaseEstimator):
 
     self._graph = None
 
-  def fit(
-      self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
-      monitors=None):
+  def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
+          monitors=None):
     """Trains a model given training data `x` predictions and `y` targets.
 
     Args:
@@ -421,21 +420,20 @@ class BaseEstimator(sklearn.BaseEstimator):
                    monitors=None,
                    log_every_steps=100,
                    fail_on_nan_loss=True):
-    # TODO(wicke): This is a hack and needs to go.
-    if self._config.execution_mode not in ('all', 'train'):
-      return
+    # TODO(wicke): Remove this once Model and associated code are gone.
+    if hasattr(self._config, 'execution_mode'):
+      if self._config.execution_mode not in ('all', 'train'):
+        return
 
-    if not self._model_dir:
-      raise ValueError('Estimator\'s model_dir should be non-empty.')
-
-    # Stagger startup of worker sessions based on task id.
-    sleep_secs = min(self._config.training_worker_max_startup_secs,
-                     self._config.task *
-                     self._config.training_worker_session_startup_stagger_secs)
-    if sleep_secs:
-      logging.info('Waiting %d secs before starting task %d.', sleep_secs,
-                   self._config.task)
-      time.sleep(sleep_secs)
+      # Stagger startup of worker sessions based on task id.
+      sleep_secs = min(
+          self._config.training_worker_max_startup_secs,
+          self._config.task *
+          self._config.training_worker_session_startup_stagger_secs)
+      if sleep_secs:
+        logging.info('Waiting %d secs before starting task %d.', sleep_secs,
+                     self._config.task)
+        time.sleep(sleep_secs)
 
     # Device allocation
     device_fn = device_fn or self._device_fn
@@ -454,7 +452,7 @@ class BaseEstimator(sklearn.BaseEstimator):
       monitors += monitors_lib.get_default_monitors(
           loss_op=loss_op,
           summary_op=logging_ops.get_summary_op(),
-          save_summary_steps=100,
+          save_summary_steps=self._config.save_summary_steps,
           summary_writer=graph_actions.get_summary_writer(self._model_dir))
 
       is_chief = self._config.task == 0
@@ -478,8 +476,9 @@ class BaseEstimator(sklearn.BaseEstimator):
           log_every_steps=log_every_steps,
           supervisor_is_chief=is_chief,
           supervisor_master=self._config.master,
+          supervisor_save_model_secs=self._config.save_checkpoints_secs,
           feed_fn=feed_fn,
-          max_steps=steps,
+          steps=steps,
           fail_on_nan_loss=fail_on_nan_loss,
           monitors=monitors)
 
@@ -513,8 +512,9 @@ class BaseEstimator(sklearn.BaseEstimator):
                       feed_fn=None,
                       metrics=None,
                       name=''):
-    # TODO(wicke): This is a hack and needs to go.
-    if self._config.execution_mode not in ('all', 'evaluate', 'eval_evalset'):
+    # TODO(wicke): Remove this once Model and associated code are gone.
+    if (hasattr(self._config, 'execution_mode') and
+        self._config.execution_mode not in ('all', 'evaluate', 'eval_evalset')):
       return
 
     # Check that model has been trained.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index 515134be932..4884b1290e5 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -46,10 +46,10 @@ class LinearClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
       ...
     estimator.fit(input_fn=input_fn_train)
     estimator.evaluate(input_fn=input_fn_eval)
-    estimator.predict(x)
+    estimator.predict(x=x)
     ```
 
-    Input of `fit`, `train`, and `evaluate` should have following features,
+    Input of `fit` and `evaluate` should have following features,
       otherwise there will be a `KeyError`:
         if `weight_column_name` is not `None`, a feature with
           `key=weight_column_name` whose value is a `Tensor`.
@@ -126,10 +126,10 @@ class LinearRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
       ...
     estimator.fit(input_fn=input_fn_train)
     estimator.evaluate(input_fn=input_fn_eval)
-    estimator.predict(x)
+    estimator.predict(x=x)
     ```
 
-    Input of `fit`, `train`, and `evaluate` should have following features,
+    Input of `fit` and `evaluate` should have following features,
       otherwise there will be a KeyError:
         if `weight_column_name` is not None:
           key=weight_column_name, value=a `Tensor`
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index 19d37d26b08..bfcf0d3e1f5 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -24,79 +24,54 @@ from tensorflow.python import GPUOptions
 
 
 class RunConfig(object):
-  """This class specifies the specific configurations for the run.
+  """This class specifies the specific configurations for the run."""
 
-  Parameters:
-    execution_mode: Runners use this flag to execute different tasks, like
-      training vs evaluation. 'all' (the default) executes both training and
-      eval.
-    master: TensorFlow master. Empty string (the default) for local.
-    task: Task id of the replica running the training (default: 0).
-    num_ps_replicas: Number of parameter server tasks to use (default: 0).
-    training_worker_session_startup_stagger_secs: Seconds to sleep between the
-      startup of each worker task session (default: 5).
-    training_worker_max_startup_secs: Max seconds to wait before starting any
-      worker (default: 60).
-    eval_delay_secs: Number of seconds between the beginning of each eval run.
-      If one run takes more than this amount of time, the next run will start
-      immediately once that run completes (default 60).
-    eval_steps: Number of steps to run in each eval (default: 100).
-    num_cores: Number of cores to be used (default: 4).
-    verbose: Controls the verbosity, possible values:
-      0: the algorithm and debug information is muted.
-      1: trainer prints the progress.
-      2: log device placement is printed.
-    gpu_memory_fraction: Fraction of GPU memory used by the process on
-      each GPU uniformly on the same machine.
-    tf_random_seed: Random seed for TensorFlow initializers.
-      Setting this value allows consistency between reruns.
-    keep_checkpoint_max: The maximum number of recent checkpoint files to keep.
-      As new files are created, older files are deleted.
-      If None or 0, all checkpoint files are kept.
-      Defaults to 5 (that is, the 5 most recent checkpoint files are kept.)
-    keep_checkpoint_every_n_hours: Number of hours between each checkpoint
-      to be saved. The default value of 10,000 hours effectively disables
-      the feature.
-
-  Attributes:
-    tf_master: Tensorflow master.
-    tf_config: Tensorflow Session Config proto.
-    tf_random_seed: Tensorflow random seed.
-    keep_checkpoint_max: Maximum number of checkpoints to keep.
-    keep_checkpoint_every_n_hours: Number of hours between each checkpoint.
-  """
-
-  # TODO(wicke): Cull unused options.
+  # TODO(wicke): Move options out once functionality is covered by monitors
   def __init__(self,
-               execution_mode='all',
                master='',
                task=0,
                num_ps_replicas=0,
-               training_worker_session_startup_stagger_secs=5,
-               training_worker_max_startup_secs=60,
-               eval_delay_secs=60,
-               eval_steps=100,
                num_cores=4,
-               verbose=1,
+               log_device_placement=False,
                gpu_memory_fraction=1,
                tf_random_seed=42,
+               save_summary_steps=100,
+               save_checkpoints_secs=60,
                keep_checkpoint_max=5,
                keep_checkpoint_every_n_hours=10000):
-    self.execution_mode = execution_mode
+    """Constructor.
+
+    Args:
+      master: TensorFlow master. Empty string (the default) for local.
+      task: Task id of the replica running the training (default: 0).
+      num_ps_replicas: Number of parameter server tasks to use (default: 0).
+      num_cores: Number of cores to be used (default: 4).
+      log_device_placement: Log the op placement to devices (default: False).
+      gpu_memory_fraction: Fraction of GPU memory used by the process on
+        each GPU uniformly on the same machine.
+      tf_random_seed: Random seed for TensorFlow initializers.
+        Setting this value allows consistency between reruns.
+      save_summary_steps: Save summaries every this many steps.
+      save_checkpoints_secs: Save checkpoints every this many seconds.
+      keep_checkpoint_max: The maximum number of recent checkpoint files to
+        keep. As new files are created, older files are deleted. If None or 0,
+        all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
+        checkpoint files are kept.)
+      keep_checkpoint_every_n_hours: Number of hours between each checkpoint
+        to be saved. The default value of 10,000 hours effectively disables
+        the feature.
+    """
     self.master = master
     self.task = task
     self.num_ps_replicas = num_ps_replicas
-    self.training_worker_session_startup_stagger_secs = (
-        training_worker_session_startup_stagger_secs)
-    self.training_worker_max_startup_secs = training_worker_max_startup_secs
-    self.eval_delay_secs = eval_delay_secs
-    self.eval_steps = eval_steps
     gpu_options = GPUOptions(
         per_process_gpu_memory_fraction=gpu_memory_fraction)
-    self.tf_config = ConfigProto(log_device_placement=(verbose > 1),
+    self.tf_config = ConfigProto(log_device_placement=log_device_placement,
                                  inter_op_parallelism_threads=num_cores,
                                  intra_op_parallelism_threads=num_cores,
                                  gpu_options=gpu_options)
     self.tf_random_seed = tf_random_seed
+    self.save_summary_steps = save_summary_steps
+    self.save_checkpoints_secs = save_checkpoints_secs
     self.keep_checkpoint_max = keep_checkpoint_max
     self.keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
new file mode 100644
index 00000000000..045dd730550
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -0,0 +1,134 @@
+#  Copyright 2016 Google Inc. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""Experiment class collecting information needed for a single training run."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+from tensorflow.python.platform import tf_logging as logging
+
+
+class Experiment(object):
+  """Experiment is a class containing all information needed to train a model.
+  """
+
+  def __init__(self, estimator,
+               train_input_fn,
+               eval_input_fn,
+               eval_metrics=None):
+    """Constructor for Experiment.
+
+    Args:
+      estimator: `Estimator` object.
+      train_input_fn: function, returns features and targets for training.
+      eval_input_fn: function, returns features and targets for evaluation.
+      eval_metrics: `dict` of string, metric function. If `None`, default set
+        is used.
+    """
+    super(Experiment, self).__init__()
+    self._estimator = estimator
+    self._train_input_fn = train_input_fn
+    self._eval_input_fn = eval_input_fn
+    self._eval_metrics = eval_metrics
+
+  def train(self, steps=None, monitors=None, delay_secs=0):
+    """Fit the estimator using the training data.
+
+    Train the estimator for `steps` steps, after waiting for `delay_secs`
+    seconds. If `steps` is `None`, train forever.
+
+    Args:
+      steps: Perform this many steps of training. `None`, the default, means
+        train forever.
+      monitors: A list of monitors to pass to the `Estimator`'s `fit` function.
+      delay_secs: Start training after this many seconds.
+
+    Returns:
+      The trained estimator.
+    """
+
+    if delay_secs:
+      logging.info("Waiting %d secs before starting training.", delay_secs)
+      time.sleep(delay_secs)
+
+    return self._estimator.fit(input_fn=self._train_input_fn,
+                               steps=steps, monitors=monitors)
+
+  def evaluate(self, steps=None, delay_secs=0):
+    """Evaluate on the evaluation data.
+
+    Runs evaluation on the evaluation data and returns the result. If `steps`
+    is given, only run for this many steps. Start the evaluation after
+    `delay_secs` seconds.
+
+    Args:
+      steps: Run this many steps of evaluation.
+      delay_secs: Start evaluating after waiting for this many seconds.
+
+    Returns:
+      The result of the `evaluate` call to the `Estimator`.
+    """
+
+    if delay_secs:
+      logging.info("Waiting %d secs before starting eval.", delay_secs)
+      time.sleep(delay_secs)
+
+    return self._estimator.evaluate(input_fn=self._eval_input_fn,
+                                    steps=steps,
+                                    metrics=self._eval_metrics)
+
+  def _continuous_eval(self, input_fn, steps=1000, delay_secs=0,
+                       throttle_delay_secs=60):
+    """Run continuous eval on the eval data.
+
+    Run `steps` steps of evaluation on the evaluation data set. This function
+    starts evaluating after `delay_secs` seconds and then runs no more than one
+    evaluation per `throttle_delay_secs`. It never returns.
+
+    Args:
+      input_fn: The input to use for this eval.
+      steps: Number of steps per evaluation run.
+      delay_secs: Start evaluating after this many seconds.
+      throttle_delay_secs: Do not re-evaluate unless the last evaluation was
+        started at least this many seconds ago.
+    """
+    if delay_secs:
+      logging.info("Waiting %f secs before starting eval.", delay_secs)
+      time.sleep(delay_secs)
+
+    while True:
+      start = time.time()
+      self._estimator.evaluate(input_fn=input_fn,
+                               steps=steps,
+                               metrics=self._eval_metrics)
+      duration = time.time() - start
+      if duration < throttle_delay_secs:
+        difference = throttle_delay_secs - duration
+        logging.info("Waiting %f secs before starting next eval run.",
+                     difference)
+        time.sleep(difference)
+
+  def continuous_eval(self, steps=1000, delay_secs=0, throttle_delay_secs=60):
+    self._continuous_eval(self._eval_input_fn, steps=steps,
+                          delay_secs=delay_secs,
+                          throttle_delay_secs=throttle_delay_secs)
+
+  def continuous_eval_on_train_data(self, steps=1000, delay_secs=0,
+                                    throttle_delay_secs=60):
+    self._continuous_eval(self._train_input_fn, steps=steps,
+                          delay_secs=delay_secs,
+                          throttle_delay_secs=throttle_delay_secs)
diff --git a/tensorflow/contrib/learn/python/learn/graph_actions.py b/tensorflow/contrib/learn/python/learn/graph_actions.py
index ef57d7ce360..d96f99efa29 100644
--- a/tensorflow/contrib/learn/python/learn/graph_actions.py
+++ b/tensorflow/contrib/learn/python/learn/graph_actions.py
@@ -30,8 +30,9 @@ from six import reraise
 
 from tensorflow.contrib.framework.python.ops import ops as contrib_ops
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables
-from tensorflow.contrib.layers.python.layers import summaries
 from tensorflow.contrib.learn.python.learn import monitors as monitors_lib
+from tensorflow.contrib.learn.python.learn.utils import checkpoints
+from tensorflow.core.framework import summary_pb2
 from tensorflow.python.client import session as tf_session
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -131,7 +132,7 @@ def train(graph,
           supervisor_save_model_secs=600,
           supervisor_save_summaries_steps=100,
           feed_fn=None,
-          max_steps=None,
+          steps=None,
           fail_on_nan_loss=True,
           monitors=None):
   """Train a model.
@@ -173,7 +174,7 @@ def train(graph,
       `supervisor_save_summaries_steps` seconds when training.
     feed_fn: A function that is called every iteration to produce a `feed_dict`
       passed to `session.run` calls. Optional.
-    max_steps: Train until `global_step_tensor` evaluates to this value.
+    steps: Trains for this many steps (i.e. current global step + `steps`).
     fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op`
       evaluates to `NaN`. If false, continue training as if nothing happened.
     monitors: List of `BaseMonitor` subclass instances. Used for callbacks
@@ -192,28 +193,36 @@ def train(graph,
   if not output_dir:
     raise ValueError('Output directory should be non-empty.')
 
-  global_step_tensor = contrib_variables.assert_or_get_global_step(
-      graph, global_step_tensor)
-  if global_step_tensor is None:
-    raise ValueError('No "global_step" was provided or found in the graph.')
+  with graph.as_default():
+    global_step_tensor = contrib_variables.assert_or_get_global_step(
+        graph, global_step_tensor)
+    if global_step_tensor is None:
+      raise ValueError('No "global_step" was provided or found in the graph.')
 
-  summary_writer = (get_summary_writer(output_dir)
-                    if supervisor_is_chief else None)
+    # Get current step.
+    try:
+      start_step = checkpoints.load_variable(
+          output_dir, global_step_tensor.name)
+    except (errors.NotFoundError, ValueError):
+      start_step = 0
 
-  # TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors.
-  if not supervisor_is_chief:
-    # monitors should run only on the chief.
-    monitors = []
-  elif not monitors:
-    monitors = monitors_lib.get_default_monitors(
-        loss_op=loss_op,
-        summary_op=logging_ops.get_summary_op(),
-        save_summary_steps=supervisor_save_summaries_steps,
-        summary_writer=summary_writer)
+    summary_writer = (get_summary_writer(output_dir)
+                      if supervisor_is_chief else None)
 
-  # Start monitors, can create graph parts.
-  for monitor in monitors:
-    monitor.begin(max_steps=max_steps)
+    # TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors.
+    if not supervisor_is_chief:
+      # monitors should run only on the chief.
+      monitors = []
+    elif not monitors:
+      monitors = monitors_lib.get_default_monitors(
+          loss_op=loss_op,
+          summary_op=logging_ops.get_summary_op(),
+          save_summary_steps=supervisor_save_summaries_steps,
+          summary_writer=summary_writer)
+
+    # Start monitors, can create graph parts. `steps` may be None (no limit).
+    for monitor in monitors:
+      monitor.begin(max_steps=None if steps is None else start_step + steps)
 
   supervisor = tf_supervisor.Supervisor(
       graph,
@@ -235,6 +244,7 @@ def train(graph,
     get_current_step = lambda: session.run(global_step_tensor)
 
     start_step = get_current_step()
+    max_steps = None if steps is None else start_step + steps
     last_step = start_step
     last_log_step = start_step
     loss_value = None
@@ -375,6 +385,28 @@ def _start_queue_runners(session, coord):
   return threads
 
 
+def _eval_results_to_str(eval_results):
+  return ', '.join('%s = %s' % (k, v) for k, v in eval_results.items())
+
+
+def _write_summary_results(output_dir, eval_results, current_global_step):
+  """Writes eval results into summary file in given dir."""
+  logging.info('Saving evaluation summary for %d step: %s',
+               current_global_step, _eval_results_to_str(eval_results))
+  summary_writer = get_summary_writer(output_dir)
+  summary = summary_pb2.Summary()
+  for key in eval_results:
+    if eval_results[key] is None:
+      continue
+    # Non-float results still get a tag entry, just without a simple_value.
+    value = summary.value.add()
+    value.tag = key
+    if isinstance(eval_results[key], (np.float32, float)):
+      value.simple_value = float(eval_results[key])
+  summary_writer.add_summary(summary, current_global_step)
+  summary_writer.close()
+
+
 # TODO(ptucker): Add unit test.
 def evaluate(graph,
              output_dir,
@@ -424,32 +456,26 @@ def evaluate(graph,
       eval steps were run.
     global_step: The global step this evaluation corresponds to.
   """
-  global_step_tensor = contrib_variables.assert_or_get_global_step(
-      graph, global_step_tensor)
+  with graph.as_default():
+    global_step_tensor = contrib_variables.assert_or_get_global_step(
+        graph, global_step_tensor)
 
-  for key, value in eval_dict.items():
-    if not summaries.is_summary_tag_unique(key):
-      continue
-    if isinstance(value, ops.Tensor):
-      summaries.summarize_tensor(value, tag=key)
+    # Create or get summary op, global_step and saver.
+    saver = _get_saver()
+    local_init_op = _get_local_init_op()
+    ready_op = _get_ready_op()
 
-  # Create or get summary op, global_step and saver.
-  summary_op = logging_ops.get_summary_op()
-  saver = _get_saver()
-  local_init_op = _get_local_init_op()
-  ready_op = _get_ready_op()
+    session_manager = session_manager_lib.SessionManager(
+        local_init_op=local_init_op,
+        ready_op=ready_op)
+    session, initialized = session_manager.recover_session(
+        master=supervisor_master,
+        saver=saver,
+        checkpoint_dir=checkpoint_path)
 
-  session_manager = session_manager_lib.SessionManager(
-      local_init_op=local_init_op,
-      ready_op=ready_op)
-  session, initialized = session_manager.recover_session(
-      master=supervisor_master,
-      saver=saver,
-      checkpoint_dir=checkpoint_path)
-
-  # Start queue runners.
-  coord = coordinator.Coordinator()
-  threads = _start_queue_runners(session, coord)
+    # Start queue runners.
+    coord = coordinator.Coordinator()
+    threads = _start_queue_runners(session, coord)
 
   with session:
     if not initialized:
@@ -488,8 +514,7 @@ def evaluate(graph,
             duration = time.time() - start_time
             logging.info('Results after %d steps (%.3f sec/batch): %s.',
                          step, float(duration),
-                         ', '.join('%s = %s' % (k, v)
-                                   for k, v in eval_results.items()))
+                         _eval_results_to_str(eval_results))
       finally:
         if eval_results is None or step != eval_step:
           eval_results = session.run(eval_dict, feed_dict=feed_dict)
@@ -498,20 +523,6 @@ def evaluate(graph,
         coord.request_stop()
         coord.join(threads, stop_grace_period_secs=120)
 
-        # Make our own summary writer and write a summary to the eval dir.
-        # Only is feed_fn is not provided.
-        # TODO(ipolosukhin): Convert evaluation to use streaming_metrics,
-        # then we can save for non feed_fn as well.
-        if summary_op is not None and feed_fn is None:
-          summary_writer = None
-          try:
-            summary_writer = get_summary_writer(output_dir)
-            summary_str = session.run(summary_op)
-            if summary_str:
-              summary_writer.add_summary(summary_str, current_global_step)
-          finally:
-            if summary_writer:
-              summary_writer.close()
     # catch OutOfRangeError which is thrown when queue is out of data (and for
     # other reasons as well).
     except errors.OutOfRangeError as e:
@@ -526,6 +537,9 @@ def evaluate(graph,
       else:
         logging.warn('Input iterator is exhausted: %s.', e)
 
+  # Save summaries for this evaluation.
+  _write_summary_results(output_dir, eval_results, current_global_step)
+
   return eval_results, current_global_step
 
 
diff --git a/tensorflow/contrib/learn/python/learn/learn_runner.py b/tensorflow/contrib/learn/python/learn/learn_runner.py
new file mode 100644
index 00000000000..97c30d57466
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/learn_runner.py
@@ -0,0 +1,110 @@
+# pylint: disable=g-bad-file-header
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Runs an Experiment."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.learn.python.learn.experiment import Experiment
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import tf_logging as logging
+
+
+FLAGS = flags.FLAGS
+
+
+flags.DEFINE_string('schedule', '', 'Schedule to run for this experiment. '
+                    'A schedule identifies a method on the Experiment '
+                    'instance returned by the function passed to the '
+                    'run() call')
+flags.DEFINE_string('output_dir', '', 'Base output directory. Made '
+                    'available to the experiment builder function passed '
+                    'to run(). All files written by the Experiment are '
+                    'expected to be written into this directory.')
+
+
+def _schedule_names(experiment):
+  """Lists the public callable attributes of `experiment`.
+
+  `dir()` is used instead of the instance `__dict__` because methods live on
+  the class, not in the instance dictionary.
+
+  Args:
+    experiment: Object whose callable members are listed.
+
+  Returns:
+    `list` of attribute names, in `dir()` order.
+  """
+  return [name for name in dir(experiment)
+          if not name.startswith('_')
+          and callable(getattr(experiment, name, None))]
+
+
+def run(experiment_fn):
+  """Make and run an experiment.
+
+  Calls `experiment_fn(output_dir=FLAGS.output_dir)` to build an `Experiment`
+  and then calls the method of it named by `FLAGS.schedule`.
+
+  Args:
+    experiment_fn: Callable taking an `output_dir` keyword argument and
+      returning an `Experiment` instance.
+
+  Returns:
+    The return value of the scheduled method.
+
+  Raises:
+    RuntimeError: If `--output_dir` or `--schedule` is empty.
+    TypeError: If `experiment_fn` is not callable or does not return an
+      `Experiment`, or if the scheduled attribute is not callable.
+    ValueError: If the experiment has no attribute named by `--schedule`.
+  """
+
+  if not FLAGS.output_dir:
+    raise RuntimeError('Must specify an output directory (use --output_dir).')
+  if not FLAGS.schedule:
+    raise RuntimeError('Must specify a schedule (use --schedule).')
+
+  if not callable(experiment_fn):
+    # Wrap in a tuple so that a tuple-valued `experiment_fn` is formatted as a
+    # single value instead of being unpacked as format arguments.
+    raise TypeError('Experiment builder "%s" is not callable.' %
+                    (experiment_fn,))
+
+  # Call the builder
+  experiment = experiment_fn(output_dir=FLAGS.output_dir)
+  if not isinstance(experiment, Experiment):
+    raise TypeError('Experiment builder did not return an Experiment '
+                    'instance, got %s instead.' % type(experiment))
+
+  # Execute the schedule
+  taskname = FLAGS.schedule
+  if not hasattr(experiment, taskname):
+    logging.error('Schedule references non-existent task %s', taskname)
+    logging.error('Allowed values for this experiment are: %s',
+                  _schedule_names(experiment))
+    # Interpolate here: an exception does not format its extra arguments.
+    raise ValueError('Schedule references non-existent task %s' % taskname)
+
+  task = getattr(experiment, taskname)
+  if not callable(task):
+    logging.error('Schedule references non-callable member %s', taskname)
+    logging.error('Allowed values for this experiment are: %s',
+                  _schedule_names(experiment))
+    raise TypeError('Schedule references non-callable member %s' % taskname)
+
+  return task()
diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index 066843faeff..f2ce5b0ceb2 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -19,7 +19,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+import six
+
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import saver
 from tensorflow.python.training import summary_io
 
 
@@ -135,17 +140,27 @@ class PrintTensor(EveryN):
   """
 
   def __init__(self, tensor_names, every_n=100, first_n=1):
+    """Initializes PrintTensor monitor.
+
+    Args:
+      tensor_names: `dict` of tag to tensor names or
+          `iterable` of tensor names (strings).
+      every_n: Print every N steps.
+      first_n: Print first N steps.
+    """
     super(PrintTensor, self).__init__(every_n, first_n)
+    if not isinstance(tensor_names, dict):
+      tensor_names = {item: item for item in tensor_names}
     self._tensor_names = tensor_names
 
   def every_n_step_begin(self, unused_step):
-    return self._tensor_names
+    return list(self._tensor_names.values())
 
   def every_n_step_end(self, step, outputs):
     stats = []
-    for name in self._tensor_names:
-      if name in outputs:
-        stats.append("%s = %s" % (name, str(outputs[name])))
+    for tag, tensor_name in six.iteritems(self._tensor_names):
+      if tensor_name in outputs:
+        stats.append("%s = %s" % (tag, str(outputs[tensor_name])))
     logging.info("Step %d: %s" % (step, ", ".join(stats)))
 
 
@@ -179,14 +194,45 @@ class SummarySaver(EveryN):
 
 
 class ValidationMonitor(EveryN):
-  """Runs evaluation every n steps.
-
-  Can do early stopping on validation loss if `early_stopping_rounds` provided.
+  """Runs evaluation of the Estimator every n steps.
 
+  Can do early stopping on validation metrics if
+  `early_stopping_rounds` provided.
   """
 
-  def __init__(self, x=None, y=None, input_fn=None,
-               every_n_steps=100, early_stopping_rounds=None):
+  def __init__(self, x=None, y=None, input_fn=None, batch_size=None,
+               every_n_steps=100, metrics=None, early_stopping_rounds=None,
+               early_stopping_metric="loss",
+               early_stopping_metric_minimize=True, name=None):
+    """Initializes ValidationMonitor.
+
+    Args:
+      x: matrix or tensor of shape [n_samples, n_features...]. Can be
+         iterator that returns arrays of features. The training input
+         samples for fitting the model. If set, `input_fn` must be `None`.
+      y: vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
+         iterator that returns array of targets. The training target values
+         (class labels in classification, real numbers in regression). If set,
+         `input_fn` must be `None`.
+      input_fn: Input function. If set, `x`, `y`, and `batch_size` must be
+          `None`.
+      batch_size: minibatch size to use on the input, defaults to first
+          dimension of `x`. Must be `None` if `input_fn` is provided.
+      every_n_steps: Runs this monitor every N steps.
+      metrics: Dict of metric ops to run. If None, the default metric functions
+        are used; if {}, no metrics are used.
+      early_stopping_rounds: If the early stopping metric didn't improve for
+          this many steps, then stop training.
+      early_stopping_metric: `str`, name of the metric to early stop.
+      early_stopping_metric_minimize: `bool`, True if minimize, False
+          if maximize. For example, minimize `loss` or `mean_squared_error` and
+          maximize `accuracy` or `f1`.
+      name: `str`, appended to output sub-folder. If None uses `eval`
+          sub-folder, else `eval-%name%` is used to save summaries.
+
+    Raises:
+      ValueError: If both x and input_fn are provided.
+    """
     super(ValidationMonitor, self).__init__(every_n_steps=every_n_steps,
                                             first_n_steps=-1)
     if x is None and input_fn is None:
@@ -194,25 +240,87 @@ class ValidationMonitor(EveryN):
     self.x = x
     self.y = y
     self.input_fn = input_fn
-    self.min_loss_step = 0
-    self.min_loss = None
+    self.batch_size = batch_size
+    self.metrics = metrics
     self.early_stopping_rounds = early_stopping_rounds
+    self.early_stopping_metric = early_stopping_metric
+    self.early_stopping_metric_minimize = early_stopping_metric_minimize
+    self.name = name
+    self._best_value_step = None
+    self._best_value = None
+    self._early_stopped = False
+    self._latest_path = None
+    self._latest_path_step = None
+
+  @property
+  def early_stopped(self):
+    """Whether this monitor has requested early stopping."""
+    return self._early_stopped
+
+  @property
+  def best_step(self):
+    """Step at which `best_value` was recorded, or `None` if none was yet."""
+    return self._best_value_step
+
+  @property
+  def best_value(self):
+    """Best early-stopping metric value seen so far, or `None`."""
+    return self._best_value
 
   def every_n_step_end(self, step, unused_outputs):
+    """Runs `self._estimator.evaluate` and applies early stopping.
+
+    Evaluation is skipped when no checkpoint exists yet, or when the latest
+    checkpoint is the same one that was seen on the previous evaluation.
+
+    Args:
+      step: `int`, the current step.
+      unused_outputs: Ignored.
+
+    Returns:
+      `True` if `early_stopping_rounds` is set and the early-stopping metric
+      has not improved for at least that many steps; `False` otherwise.
+    """
+    # Check that we are not running evaluation on the same checkpoint.
+    latest_path = saver.latest_checkpoint(self._estimator.model_dir)
+    if latest_path is None:
+      # No checkpoint has been written yet. Without this branch the equality
+      # test below matches the initial `None` and formats the still-unset
+      # `_latest_path_step` with `%d`.
+      logging.info("Skipping evaluation for step %d: no checkpoint in %s.",
+                   step, self._estimator.model_dir)
+      return False
+    if latest_path == self._latest_path:
+      logging.info("Skipping evaluation due to same checkpoint %s for step %d "
+                   "as for step %d.", latest_path, step, self._latest_path_step)
+      return False
+    self._latest_path = latest_path
+    self._latest_path_step = step
+
+    # Run evaluation and log it.
     outputs = self._estimator.evaluate(
-        x=self.x, y=self.y, input_fn=self.input_fn)
+        x=self.x, y=self.y, input_fn=self.input_fn, batch_size=self.batch_size,
+        metrics=self.metrics, name=self.name)
     stats = []
     for name in outputs:
       stats.append("%s = %s" % (name, str(outputs[name])))
     logging.info("Validation (step %d): %s" % (step, ", ".join(stats)))
+
+    # Early stopping logic.
     if self.early_stopping_rounds is not None:
-      if self.min_loss is None or outputs["loss"] < self.min_loss:
-        self.min_loss = outputs["loss"]
-        self.min_loss_step = step
-      stop_now = (step - self.min_loss_step >= self.early_stopping_rounds)
+      if (self._best_value is None or
+          (self.early_stopping_metric_minimize and
+           outputs[self.early_stopping_metric] < self._best_value) or
+          (not self.early_stopping_metric_minimize and
+           outputs[self.early_stopping_metric] > self._best_value)):
+        self._best_value = outputs[self.early_stopping_metric]
+        self._best_value_step = step
+      stop_now = (step - self._best_value_step >= self.early_stopping_rounds)
       if stop_now:
-        logging.info("Stopping. Best step: {} with loss {}."
-                     .format(self.min_loss_step, self.min_loss))
+        logging.info("Stopping. Best step: {} with {} = {}."
+                     .format(self._best_value_step,
+                             self.early_stopping_metric, self._best_value))
+        self._early_stopped = True
         return True
     return False
 
@@ -220,7 +305,7 @@ class ValidationMonitor(EveryN):
 class CaptureVariable(EveryN):
   """Capture a variable value into a `list`.
 
-  It's useful for unit testing.
+  This monitor is useful for unit testing.
   """
 
   def __init__(self, var_name, every_n=100, first_n=1):
@@ -239,9 +324,96 @@ def get_default_monitors(loss_op=None, summary_op=None, save_summary_steps=100,
                          output_dir=None, summary_writer=None):
   monitors = []
   if loss_op is not None:
-    monitors.append(PrintTensor([loss_op.name]))
+    monitors.append(PrintTensor(tensor_names={"loss": loss_op.name}))
   if summary_op is not None:
     monitors.append(SummarySaver(summary_op, save_steps=save_summary_steps,
                                  output_dir=output_dir,
                                  summary_writer=summary_writer))
   return monitors
+
+
+class GraphDump(BaseMonitor):
+  """Dumps almost all tensors in the graph at every step.
+
+  Note, this is very expensive, prefer `PrintTensor` or `CaptureVariable` if
+  you are not debugging.
+  """
+
+  IGNORE_OPS = ["Const", "Assign", "Identity", "Placeholder",
+                "RandomUniform", "Cast", "RestoreSlice"]
+
+  def __init__(self, ignore_ops=None):
+    """Initializes GraphDump monitor.
+
+    Args:
+      ignore_ops: `list` of op type names; nodes whose `op` is in the list
+          are not dumped. If `None` or empty, GraphDump.IGNORE_OPS is used.
+    """
+    self.ignore_ops = ignore_ops or GraphDump.IGNORE_OPS
+    self._data = []
+
+  def begin(self, max_steps):
+    """Collects the `:0` output tensor of every non-ignored graph node.
+
+    Args:
+      max_steps: Unused.
+    """
+    self.tensors = []
+    graph = ops.get_default_graph()
+    graph_def = graph.as_graph_def()
+    for node in graph_def.node:
+      if node.op in self.ignore_ops:
+        continue
+      try:
+        self.tensors.append(graph.get_tensor_by_name(node.name + ":0"))
+      except KeyError:
+        # Nodes without a retrievable `:0` tensor are left out of the dump.
+        pass
+
+  def step_begin(self, step):
+    """Returns the tensors collected in `begin`."""
+    return self.tensors
+
+  def step_end(self, step, outputs):
+    """Appends `outputs` to the recorded data."""
+    self._data.append(outputs)
+
+  @property
+  def data(self):
+    """`list` with one recorded `outputs` value per `step_end` call."""
+    return self._data
+
+  def compare(self, other_dump, step, atol=1e-06):
+    """Compares two `GraphDump` monitors and returns differences.
+
+    Args:
+      other_dump: Another `GraphDump` monitor.
+      step: `int`, step to compare on.
+      atol: `float`, absolute tolerance in comparison of floating arrays.
+
+    Returns:
+      Returns tuple:
+        matched: `list` of keys that matched.
+        non_matched: `dict` of keys to difference.
+    """
+    non_matched = {}
+    matched = []
+    for key in self.data[step]:
+      # `six.string_types` is `str` on Python 3 and `basestring` on Python 2;
+      # the bare name `unicode` is undefined on Python 3.
+      if not isinstance(key, six.string_types):
+        continue
+      value1, value2 = self.data[step][key], other_dump.data[step][key]
+      if isinstance(value1, str):
+        continue
+      if isinstance(value1, np.ndarray):
+        if not np.allclose(value1, value2, atol=atol):
+          non_matched[key] = value1 - value2
+        else:
+          matched.append(key)
+      else:
+        if value1 != value2:
+          non_matched[key] = (value1, value2)
+        else:
+          matched.append(key)
+    return matched, non_matched
diff --git a/tensorflow/contrib/learn/python/learn/tests/early_stopping_test.py b/tensorflow/contrib/learn/python/learn/tests/early_stopping_test.py
index 818ea38e15c..afaf4ecbf51 100644
--- a/tensorflow/contrib/learn/python/learn/tests/early_stopping_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/early_stopping_test.py
@@ -28,6 +28,12 @@ from tensorflow.contrib.learn.python.learn.estimators._sklearn import accuracy_s
 from tensorflow.contrib.learn.python.learn.estimators._sklearn import train_test_split
 
 
+def _get_summary_events(folder):
+  if not tf.gfile.Exists(folder):
+    raise ValueError('Folder %s doesn\'t exist.' % folder)
+  return tf.contrib.testing.latest_summaries(folder)
+
+
 class EarlyStoppingTest(tf.test.TestCase):
   """Early stopping tests."""
 
@@ -35,36 +41,54 @@ class EarlyStoppingTest(tf.test.TestCase):
     random.seed(42)
 
     iris = datasets.load_iris()
-    x_train, x_test, y_train, y_test = train_test_split(iris.data,
-                                                        iris.target,
-                                                        test_size=0.2,
-                                                        random_state=42)
+    x_train, x_test, y_train, y_test = train_test_split(
+        iris.data, iris.target, test_size=0.2, random_state=42)
 
-    x_train, x_val, y_train, y_val = train_test_split(x_train,
-                                                      y_train,
-                                                      test_size=0.2)
-    val_monitor = learn.monitors.ValidationMonitor(x_val,
-                                                   y_val,
-                                                   early_stopping_rounds=100)
+    x_train, x_val, y_train, y_val = train_test_split(
+        x_train, y_train, test_size=0.2, random_state=42)
+    val_monitor = learn.monitors.ValidationMonitor(
+        x_val, y_val, every_n_steps=50, early_stopping_rounds=100,
+        early_stopping_metric='accuracy', early_stopping_metric_minimize=False)
 
     # classifier without early stopping - overfitting
-    classifier1 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
-                                                n_classes=3,
-                                                steps=1000)
+    classifier1 = learn.TensorFlowDNNClassifier(
+        hidden_units=[10, 20, 10], n_classes=3, steps=1000)
     classifier1.fit(x_train, y_train)
-    accuracy_score(y_test, classifier1.predict(x_test))
+    _ = accuracy_score(y_test, classifier1.predict(x_test))
+
+    # Full 1000 steps, 11 summaries and no evaluation summary.
+    # 11 summaries = first + every 100 out of 1000 steps.
+    self.assertEqual(11, len(_get_summary_events(classifier1.model_dir)))
+    with self.assertRaises(ValueError):
+      _get_summary_events(classifier1.model_dir + '/eval')
 
     # classifier with early stopping - improved accuracy on testing set
-    classifier2 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
-                                                n_classes=3,
-                                                steps=1000)
+    classifier2 = learn.TensorFlowDNNClassifier(
+        hidden_units=[10, 20, 10], n_classes=3, steps=2000,
+        config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
 
     classifier2.fit(x_train, y_train, monitors=[val_monitor])
-    accuracy_score(y_test, classifier2.predict(x_test))
+    _ = accuracy_score(y_val, classifier2.predict(x_val))
+    _ = accuracy_score(y_test, classifier2.predict(x_test))
+
+    # Note, this test is unstable, so not checking for equality.
+    # See stability_test for examples of stability issues.
+    if val_monitor.early_stopped:
+      self.assertLess(val_monitor.best_step, 2000)
+      # Note, because the validation monitor stops only after the best score
+      # has occurred, the accuracy at the current checkpoint is lower.
+      # TODO(ipolosukhin): Time machine for restoring old checkpoints?
+      # Flaky: best_value is still not always better than the score2 value.
+      # self.assertGreater(val_monitor.best_value, score2_val)
+
+      # Early stopped, unstable so checking only < than max.
+      self.assertLess(len(_get_summary_events(classifier2.model_dir)), 21)
+      self.assertLess(len(_get_summary_events(
+          classifier2.model_dir + '/eval')), 4)
 
     # TODO(ipolosukhin): Restore this?
     # self.assertGreater(score2, score1, "No improvement using early stopping.")
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
   tf.test.main()
diff --git a/tensorflow/contrib/learn/python/learn/tests/experiment_test.py b/tensorflow/contrib/learn/python/learn/tests/experiment_test.py
new file mode 100644
index 00000000000..1da4b2451e5
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/tests/experiment_test.py
@@ -0,0 +1,125 @@
+# pylint: disable=g-bad-file-header
+#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""Tests for TaskRunner and Experiment class."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import tensorflow as tf
+
+
+class TestEstimator(object):
+  """Fake estimator that counts `fit` and `evaluate` calls.
+
+  `evaluate` raises `StopIteration` on its sixth call, which is what ends the
+  `continuous_eval` tests below.
+  """
+
+  def __init__(self):
+    self.eval_count = 0
+    self.fit_count = 0
+
+  def evaluate(self, **kwargs):
+    tf.logging.info('evaluate called with args: %s' % kwargs)
+    self.eval_count += 1
+    if self.eval_count > 5:
+      tf.logging.info('Ran 6 evals. Done.')
+      raise StopIteration()
+    return [(key, kwargs[key]) for key in sorted(kwargs.keys())]
+
+  def fit(self, **kwargs):
+    tf.logging.info('fit called with args: %s' % kwargs)
+    self.fit_count += 1
+    return [(key, kwargs[key]) for key in sorted(kwargs.keys())]
+
+
+class ExperimentTest(tf.test.TestCase):
+  """Tests `tf.contrib.learn.Experiment` using the `TestEstimator` fake."""
+
+  def test_train(self):
+    est = TestEstimator()
+    ex = tf.contrib.learn.Experiment(est,
+                                     train_input_fn='train_input',
+                                     eval_input_fn='eval_input',
+                                     eval_metrics='eval_metrics')
+    ex.train(delay_secs=0)
+    self.assertEqual(1, est.fit_count)
+    self.assertEqual(0, est.eval_count)
+
+  def test_train_delay(self):
+    est = TestEstimator()
+    ex = tf.contrib.learn.Experiment(est,
+                                     train_input_fn='train_input',
+                                     eval_input_fn='eval_input')
+    for delay in [0, 1, 3]:
+      start = time.time()
+      ex.train(delay_secs=delay)
+      duration = time.time() - start
+      tf.logging.info('train duration (expected %f): %f', delay, duration)
+      self.assertAlmostEqual(duration, delay, delta=0.5)
+
+  def test_evaluate(self):
+    est = TestEstimator()
+    ex = tf.contrib.learn.Experiment(est,
+                                     train_input_fn='train_input',
+                                     eval_input_fn='eval_input',
+                                     eval_metrics='eval_metrics')
+    ex.evaluate(steps='steps', delay_secs=0)
+    self.assertEqual(1, est.eval_count)
+    self.assertEqual(0, est.fit_count)
+
+  def test_evaluate_delay(self):
+    est = TestEstimator()
+    ex = tf.contrib.learn.Experiment(est,
+                                     train_input_fn='train_input',
+                                     eval_input_fn='eval_input')
+    for delay in [0, 1, 3]:
+      start = time.time()
+      ex.evaluate(delay_secs=delay)
+      duration = time.time() - start
+      tf.logging.info('eval duration (expected %f): %f', delay, duration)
+      self.assertAlmostEqual(duration, delay, delta=0.5)
+
+  def test_continuous_eval(self):
+    est = TestEstimator()
+    ex = tf.contrib.learn.Experiment(est,
+                                     train_input_fn='train_input',
+                                     eval_input_fn='eval_input',
+                                     eval_metrics='eval_metrics')
+    self.assertRaises(StopIteration, ex.continuous_eval,
+                      delay_secs=0, throttle_delay_secs=0)
+    self.assertEqual(6, est.eval_count)
+    self.assertEqual(0, est.fit_count)
+
+  def test_continuous_eval_throttle_delay(self):
+    for delay in [0, 1, 2]:
+      est = TestEstimator()
+      ex = tf.contrib.learn.Experiment(est,
+                                       train_input_fn='train_input',
+                                       eval_input_fn='eval_input',
+                                       eval_metrics='eval_metrics')
+      start = time.time()
+      self.assertRaises(StopIteration, ex.continuous_eval,
+                        delay_secs=0, throttle_delay_secs=delay)
+      duration = time.time() - start
+      expected = 5 * delay
+      tf.logging.info('eval duration (expected %f): %f', expected, duration)
+      self.assertAlmostEqual(duration, expected, delta=0.5)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py b/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py
new file mode 100644
index 00000000000..ef030562fff
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py
@@ -0,0 +1,107 @@
+# pylint: disable=g-bad-file-header
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""learn_runner tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.contrib.learn.python.learn import learn_runner
+
+
+FLAGS = learn_runner.FLAGS
+
+
+class TestExperiment(tf.contrib.learn.Experiment):
+
+  def __init__(self, default=None):
+    self.default = default
+
+  def simple_task(self):
+    return "simple_task, default=%s." % self.default
+
+
+# pylint: disable=unused-argument
+def build_experiment(output_dir):
+  tf.logging.info("In default build_experiment.")
+  return TestExperiment()
+
+
+def build_non_experiment(output_dir):
+  return "Ceci n'est pas un Experiment."
+# pylint: enable=unused-argument
+
+
+class MainTest(tf.test.TestCase):
+
+  def setUp(self):
+    # Make sure the flags exist. It's unclear why this is necessary.
+    if not hasattr(FLAGS, "output_dir"):
+      learn_runner.flags.DEFINE_string("output_dir", "/tmp", "Fake")
+    if not hasattr(FLAGS, "schedule"):
+      learn_runner.flags.DEFINE_string("schedule", "simple_task", "Fake")
+
+  def test_run(self):
+    FLAGS.output_dir = "/tmp"
+    FLAGS.schedule = "simple_task"
+    self.assertEqual("simple_task, default=None.",
+                     learn_runner.run(build_experiment))
+
+  def test_fail_no_output_dir(self):
+    FLAGS.output_dir = ""
+    FLAGS.schedule = "simple_test"
+    self.assertRaisesRegexp(RuntimeError,
+                            "Must specify an output directory",
+                            learn_runner.run, build_experiment)
+
+  def test_fail_no_schedule(self):
+    FLAGS.output_dir = "/tmp"
+    FLAGS.schedule = ""
+    self.assertRaisesRegexp(RuntimeError, "Must specify a schedule",
+                            learn_runner.run, build_experiment)
+
+  def test_fail_non_callable(self):
+    FLAGS.output_dir = "/tmp"
+    FLAGS.schedule = "simple_test"
+    self.assertRaisesRegexp(TypeError,
+                            "Experiment builder .* is not callable",
+                            learn_runner.run, "not callable")
+
+  def test_fail_not_experiment(self):
+    FLAGS.output_dir = "/tmp"
+    FLAGS.schedule = "simple_test"
+    self.assertRaisesRegexp(
+        TypeError, "Experiment builder did not return an Experiment",
+        learn_runner.run, build_non_experiment)
+
+  def test_fail_non_existent_task(self):
+    FLAGS.output_dir = "/tmp"
+    FLAGS.schedule = "mirage"
+    self.assertRaisesRegexp(
+        ValueError, "Schedule references non-existent task",
+        learn_runner.run, build_experiment)
+
+  def test_fail_non_callable_task(self):
+    FLAGS.output_dir = "/tmp"
+    FLAGS.schedule = "default"
+    self.assertRaisesRegexp(
+        TypeError, "Schedule references non-callable member",
+        learn_runner.run, build_experiment)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/contrib/learn/python/learn/tests/stability_test.py b/tensorflow/contrib/learn/python/learn/tests/stability_test.py
new file mode 100644
index 00000000000..373656cebd4
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/tests/stability_test.py
@@ -0,0 +1,88 @@
+# pylint: disable=g-bad-file-header
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Estimator stability (reproducibility) tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# import random
+
+import tensorflow as tf
+
+
+class StabilityTest(tf.test.TestCase):
+  """Tests that estiamtors are reproducible."""
+
+  def testRandomStability(self):
+    my_seed, minval, maxval = 42, -0.3333, 0.3333
+    with tf.Graph().as_default() as g:
+      with self.test_session(graph=g) as session:
+        tf.set_random_seed(my_seed)
+        x = tf.random_uniform([10, 10], minval=minval, maxval=maxval)
+        val1 = session.run(x)
+    with tf.Graph().as_default() as g:
+      with self.test_session(graph=g) as session:
+        tf.set_random_seed(my_seed)
+        x = tf.random_uniform([10, 10], minval=minval, maxval=maxval)
+        val2 = session.run(x)
+    self.assertAllClose(val1, val2)
+
+  def testLinearRegression(self):
+    # TODO(ipolosukhin): This doesn't pass at all, but should...
+    pass
+#     random.seed(42)
+#     boston = tf.contrib.learn.datasets.load_boston()
+#     regressor = tf.contrib.learn.LinearRegressor()
+#     regressor.fit(x=boston.data, y=boston.target, steps=1)
+#     regressor2 = tf.contrib.learn.LinearRegressor()
+#     regressor2.fit(x=boston.data, y=boston.target, steps=1)
+#     self.assertAllClose(regressor.weights_, regressor2.weights_)
+#     self.assertAllClose(regressor.bias_, regressor2.bias_)
+#     self.assertAllClose(regressor.predict(boston.data),
+#                         regressor2.predict(boston.data), atol=1e-05)
+
+  def testDNNRegression(self):
+    # TODO(ipolosukhin): This doesn't pass at all, but should...
+    # Either bugs or just general instability.
+    pass
+#     random.seed(42)
+#     boston = tf.contrib.learn.datasets.load_boston()
+#     regressor = tf.contrib.learn.DNNRegressor(
+#         hidden_units=[10],
+#         optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.001))
+#     graph_dump = tf.contrib.learn.monitors.GraphDump()
+#     regressor.fit(x=boston.data, y=boston.target, steps=1,
+#                   monitors=[graph_dump], batch_size=1)
+#     regressor2 = tf.contrib.learn.DNNRegressor(
+#         hidden_units=[10],
+#         optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.001))
+#     graph_dump2 = tf.contrib.learn.monitors.GraphDump()
+#     regressor2.fit(x=boston.data, y=boston.target, steps=1,
+#                    monitors=[graph_dump2], batch_size=1)
+#     _, non_match = graph_dump.compare(graph_dump2, 0, atol=1e-02)
+#     self.assertEmpty(non_match.keys())
+#     for weight1, weight2 in zip(regressor.weights_, regressor2.weights_):
+#       self.assertAllClose(weight1, weight2)
+#     for bias1, bias2 in zip(regressor.bias_, regressor2.bias_):
+#       self.assertAllClose(bias1, bias2)
+#     self.assertAllClose(regressor.predict(boston.data),
+#                         regressor2.predict(boston.data), atol=1e-05)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/core/graph/quantize_training.cc b/tensorflow/core/graph/quantize_training.cc
new file mode 100644
index 00000000000..23ce7daeff1
--- /dev/null
+++ b/tensorflow/core/graph/quantize_training.cc
@@ -0,0 +1,229 @@
+/* Copyright 2015 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <atomic>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/graph/quantize_training.h"
+
+#include "tensorflow/core/common_runtime/executor.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/memory_types.h"
+#include "tensorflow/core/framework/log_memory.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/subgraph.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+// Op types whose inputs get a quantize_and_dequantize op inserted.
+const std::unordered_set<string, StringPiece::Hasher> nodes_to_rewrite{
+    "MatMul", "Conv2D"};
+
+// Parameters describing how one edge should be converted.
+struct EdgeToConvert {
+  // The edge itself is not owned by this struct.
+  const Edge* edge;
+  int32 num_bits;
+  bool signed_input;
+  bool range_given;
+  float input_min;
+  float input_max;
+
+  // Stores each argument in the field of the matching meaning.
+  EdgeToConvert(const Edge* e, int32 bits, bool sign, bool range, float min,
+                float max)
+      : edge(e),
+        num_bits(bits),
+        signed_input(sign),
+        range_given(range),
+        input_min(min),
+        input_max(max) {}
+};
+
+// Reports whether `node` belongs to the backward pass, judged solely by its
+// name beginning with "gradients".
+// TODO(jmchen): Make this check more robust; a forward node may also be named
+// with a leading "gradients".
+inline bool IsGradientNode(const Graph* graph, const Node* node) {
+  static const string kPrefix = "gradients";
+  return node->name().compare(0, kPrefix.size(), kPrefix) == 0;
+}
+
+// Sets the quantize_and_dequantize parameters from the op type of `node`,
+// looking through Reshape/Identity/pooling ops. Returns false only if `node`
+// itself is of an unlisted type; recursive results are not propagated.
+bool FindType(const Graph* graph, const Node* node, bool* signed_input,
+              bool* range_given, float* input_min, float* input_max) {
+  const string src_op = node->type_string();
+  if (src_op == "Const" || src_op == "Variable") {
+    *signed_input = true;
+    *range_given = false;
+  } else if (src_op == "Relu") {
+    // Range is not given for Relu.
+    *signed_input = false;
+    *range_given = false;
+  } else if (src_op == "Relu6") {
+    *signed_input = false;
+    *range_given = true;
+    *input_min = 0;
+    *input_max = 6;
+  } else if (src_op == "Sigmoid") {
+    *signed_input = false;
+    *range_given = true;
+    *input_min = 0;
+    *input_max = 1;
+  } else if (src_op == "Tanh") {
+    *signed_input = true;
+    *range_given = true;
+    *input_min = -1;
+    *input_max = 1;
+  } else if (src_op == "Reshape") {
+    // Reshape has 2 inputs and the first one is the tensor; recurse into it.
+    for (const Edge* edge : node->in_edges()) {
+      if (edge->src_output() != Graph::kControlSlot && edge->dst_input() == 0) {
+        FindType(graph, edge->src(), signed_input, range_given, input_min,
+                 input_max);
+      }
+    }
+  } else if (src_op == "Identity" || src_op == "MaxPool" ||
+             src_op == "AvgPool" || src_op == "MaxPool3D" ||
+             src_op == "AvgPool3D") {
+    // All these Ops only have 1 data input; recurse into it.
+    for (const Edge* edge : node->in_edges()) {
+      if (edge->src_output() != Graph::kControlSlot) {
+        FindType(graph, edge->src(), signed_input, range_given, input_min,
+                 input_max);
+      }
+    }
+  } else {
+    // Unknown type, could be the model input examples.
+    // TODO: Set the params for input with user's hint.
+    *signed_input = true;
+    *range_given = false;
+    return false;
+  }
+
+  return true;
+}
+
+// For each target edge: creates (or reuses) a _QuantizeAndDequantize op on the
+// edge's source, feeds the destination from that op and removes the old edge.
+// Returns the first node-construction error, or OK.
+Status ProcessTargetEdges(Graph* graph,
+                          const std::vector<EdgeToConvert>& target_edges) {
+  // Conversion nodes created so far, keyed by name, so that edges leaving the
+  // same source node share a single conversion op.
+  std::unordered_map<string, Node*, StringPiece::Hasher> name_index;
+  for (const EdgeToConvert& edge : target_edges) {
+    Node* convert_node = nullptr;
+    const string name =
+        strings::StrCat(edge.edge->src()->name(), "/_QuantizeAndDequantize");
+
+    auto iter = name_index.find(name);
+    if (iter == name_index.end()) {
+      TF_RETURN_IF_ERROR(NodeBuilder(name, "_QuantizeAndDequantize")
+                             .Input(edge.edge->src())
+                             .Attr("signed_input", edge.signed_input)
+                             .Attr("num_bits", edge.num_bits)
+                             .Attr("range_given", edge.range_given)
+                             .Attr("input_min", edge.input_min)
+                             .Attr("input_max", edge.input_max)
+                             .Finalize(graph, &convert_node));
+      name_index[name] = convert_node;
+    } else {
+      convert_node = iter->second;
+    }
+
+    graph->AddEdge(convert_node, 0, edge.edge->dst(), edge.edge->dst_input());
+    graph->RemoveEdge(edge.edge);
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+// Inserts a _QuantizeAndDequantize op on every data input of each non-gradient
+// MatMul/Conv2D node in `graph`. Fails on a null graph, num_bits outside
+// [1, 63], a second input from an unknown op type, or a node-building error.
+Status DoQuantizeTraining(int32 num_bits, Graph* graph) {
+  if (graph == nullptr) {
+    return errors::InvalidArgument("Cannot accept empty graph pointer.");
+  }
+
+  if (num_bits < 1 || num_bits > 63) {
+    return errors::OutOfRange("num_bits should be in range [1, 63] but is: ",
+                              num_bits);
+  }
+  int potential_input = 0;
+  std::vector<EdgeToConvert> target_edges;
+  for (Node* node : graph->nodes()) {
+    if (nodes_to_rewrite.find(node->type_string()) != nodes_to_rewrite.end() &&
+        !IsGradientNode(graph, node)) {
+      // Find out which types are the inputs and convert them accordingly.
+      // 1. Const/Variable OP: This is quantized as signed tensors with no given
+      // range.
+      // 2. Activation OP: Set the range accordingly for different types of
+      // activations. Currently we handle {Relu, Relu6, Sigmoid, Tanh}
+      // 3. Identity OP: The quantization parameters depend on its input.
+      // 4. Pooling OPs: various pooling ops. Also depends on its input.
+      // 5. Reshape OP: Also depends on the first input to this op.
+      // 6. Not-Listed-Above OP: If there is only 1 such op, consider it as the
+      // model input. However, if there are >1 unknown ops, then returns an
+      // error for now to avoid unexpected behavior.
+      // Note: The list above might not be a complete list. Please let us
+      // know if you see the error so we can handle your case.
+      for (const Edge* edge : node->in_edges()) {
+        if (edge->src_output() == Graph::kControlSlot) {
+          // Skip the control dependency input.
+          continue;
+        }
+        bool signed_input = false;
+        bool range_given = false;
+        float input_min = 0;
+        float input_max = 0;
+        bool known_op = FindType(graph, edge->src(), &signed_input,
+                                 &range_given, &input_min, &input_max);
+        if (!known_op) {
+          // Unknown op is considered as input.
+          // Only support one input for now.
+          // TODO: Make this configurable if this is the desirable way to find
+          // input.
+          if (potential_input > 0) {
+            return errors::Unimplemented(
+                "Find a second unknown op: ", edge->src()->name(),
+                " with type: ", edge->src()->type_string(),
+                "; Unknown ops are considered as model input for now and "
+                "only 1 input is supported currently.");
+          }
+          potential_input++;
+        }
+
+        target_edges.emplace_back(edge, num_bits, signed_input, range_given,
+                                  input_min, input_max);
+      }
+    }
+  }
+
+  return ProcessTargetEdges(graph, target_edges);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/graph/quantize_training.h b/tensorflow/core/graph/quantize_training.h
new file mode 100644
index 00000000000..694c491620a
--- /dev/null
+++ b/tensorflow/core/graph/quantize_training.h
@@ -0,0 +1,37 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_
+#define TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_
+
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+// Rewrites graph for quantized training.
+// Rewrites the forward pass to include the precision loss with quantization so
+// the model can learn to deal with such loss and achieve better accuracy when
+// it is quantized later for inference.
+// Note that num_bits must be in [1, 63] and 'g' must not be null.
+//
+// On success, returns OK.
+//
+// On failure, returns the error status. Possible errors include:
+//    - num_bits out of range.
+//    - g is null.
+//    - More than 1 unknown ops encountered.
+Status DoQuantizeTraining(int32 num_bits, Graph* g);
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPH_QUANTIZE_TRAINING_H_
diff --git a/tensorflow/core/graph/quantize_training_test.cc b/tensorflow/core/graph/quantize_training_test.cc
new file mode 100644
index 00000000000..d6663e0a508
--- /dev/null
+++ b/tensorflow/core/graph/quantize_training_test.cc
@@ -0,0 +1,161 @@
+/* Copyright 2015 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/graph/quantize_training.h"
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+class QuantizeTrainingTest : public ::testing::Test {
+ protected:
+  QuantizeTrainingTest() { Reset(); }
+  void Reset() { g_.reset(new Graph(OpRegistry::Global())); }  // Fresh graph.
+
+  template <typename T>
+  Node* Constant(gtl::ArraySlice<T> values, TensorShape shape) {
+    return test::graph::Constant(g_.get(), test::AsTensor(values, shape));
+  }
+
+  std::unique_ptr<Graph> g_;  // Graph under test, recreated by Reset().
+};
+
+TEST_F(QuantizeTrainingTest, NormalGraph) {
+  // Construct the following graph
+  /*
+           m1      m2
+        /      \ /     \
+      Relu   Identity   c
+        |       |
+        a       b
+  */
+  Reset();
+  Graph* g = g_.get();
+  Node* a = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
+  Node* b = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
+  Node* c = Constant<float>({0.0, 1.0, 1.0, 0.0}, {2, 2});
+  g->AddControlEdge(g->source_node(), a);
+  g->AddControlEdge(g->source_node(), b);
+  g->AddControlEdge(g->source_node(), c);
+  Node* relu = test::graph::Relu(g, a);
+  Node* identity = test::graph::Identity(g, b);
+  Node* m1 = test::graph::Matmul(g, relu, identity, false, false);
+  Node* m2 = test::graph::Matmul(g, identity, c, false, false);
+  g->AddControlEdge(m1, g->sink_node());
+  g->AddControlEdge(m2, g->sink_node());
+
+  // The graph after the rewriting should be:
+  // "Q" is the quantize_and_dequantize op.
+  // Note the Q in the middle is shared by both m1 and m2.
+  /*
+         m1       m2
+      /      \ /     \
+      Q       Q       Q
+      |       |       |
+    Relu   Identity   c
+      |       |
+      a       b
+  */
+  int num_bits = 8;
+  // 4 edges to modify: both inputs of m1 and both inputs of m2.
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
+
+  // There should be 12 nodes in total including the source and sink nodes.
+  EXPECT_EQ(12, g->num_nodes());
+  // Nodes m1 and m2's inputs should be the quantize_and_dequantize op.
+  std::vector<Node*> target_nodes{m1, m2};
+  for (Node* n : target_nodes) {
+    for (Node* in : n->in_nodes()) {
+      EXPECT_EQ("_QuantizeAndDequantize", in->type_string());
+    }
+  }
+
+  // relu, identity, c should now connect to the quantize_and_dequantize nodes.
+  std::vector<Node*> target_inputs{relu, identity, c};
+  for (Node* n : target_inputs) {
+    for (Node* out : n->out_nodes()) {
+      EXPECT_EQ("_QuantizeAndDequantize", out->type_string());
+    }
+  }
+
+  // identity is fed by a Const, so its Q op should have signed_input==true.
+  NodeDef identity_Q = identity->out_nodes().begin()->def();
+  ASSERT_EQ("true",
+            SummarizeAttrValue(identity_Q.attr().find("signed_input")->second));
+  // Relu output is treated as unsigned, so its Q op has signed_input==false.
+  NodeDef relu_Q = relu->out_nodes().begin()->def();
+  ASSERT_EQ("false",
+            SummarizeAttrValue(relu_Q.attr().find("signed_input")->second));
+}
+
+TEST_F(QuantizeTrainingTest, WithBackwardNodes) {
+  // Construct the same graph plus another backward Matmul.
+  Reset();
+  Graph* g = g_.get();
+  Node* a = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
+  Node* b = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
+  Node* c = Constant<float>({0.0, 1.0, 1.0, 0.0}, {2, 2});
+  g->AddControlEdge(g->source_node(), a);
+  g->AddControlEdge(g->source_node(), b);
+  g->AddControlEdge(g->source_node(), c);
+  Node* relu = test::graph::Relu(g, a);
+  Node* identity = test::graph::Identity(g, b);
+  Node* m1 = test::graph::Matmul(g, relu, identity, false, false);
+  Node* m2 = test::graph::Matmul(g, identity, c, false, false);
+  g->AddControlEdge(m1, g->sink_node());
+  g->AddControlEdge(m2, g->sink_node());
+
+  // Add a Matmul node with name starting with "gradients".
+  Node* backward_m;
+  TF_ASSERT_OK(NodeBuilder(g->NewName("gradients/n"), "MatMul")
+                   .Input(m1)
+                   .Input(m2)
+                   .Attr("transpose_a", true)
+                   .Attr("transpose_b", false)
+                   .Finalize(g, &backward_m));
+  g->AddControlEdge(backward_m, g->sink_node());
+
+  int num_bits = 8;
+  // Still 4 changes since the inputs of backward node will not be converted.
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
+
+  // 10 nodes before the rewrite (incl. source and sink) plus 3 added Q ops.
+  EXPECT_EQ(13, g->num_nodes());
+  EXPECT_EQ(2, m2->num_inputs());
+}
+
+#undef SIMPLE_GRAPH
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index 0d0a84db799..ec878437dc8 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -384,6 +384,15 @@ Node* GetSessionTensor(Graph* g, Node* in) {
   return ret;
 }
 
+Node* Relu(Graph* g, Node* in) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Relu")
+                  .Input(in, 0)
+                  .Attr("T", DT_FLOAT)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
 void ToGraphDef(Graph* g, GraphDef* gdef) { g->ToGraphDef(gdef); }
 
 }  // end namespace graph
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index 511f6b4310c..bc4863563f9 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -169,6 +169,9 @@ Node* GetSessionTensor(Graph* g, Node* in);
 // given in "tensors".
 Node* Concat(Graph* g, Node* concat_dim, gtl::ArraySlice<Node*> tensors);
 
+// Add a Relu node in "g".
+Node* Relu(Graph* g, Node* in);
+
 }  // end namespace graph
 }  // end namespace test
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/avgpooling_op.cc b/tensorflow/core/kernels/avgpooling_op.cc
index fc7f6d1a5a4..d666546602e 100644
--- a/tensorflow/core/kernels/avgpooling_op.cc
+++ b/tensorflow/core/kernels/avgpooling_op.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
@@ -99,10 +100,12 @@ class AvgPoolingOp : public UnaryOp<T> {
   TensorFormat data_format_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("AvgPool")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        AvgPoolingOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    AvgPoolingOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("AvgPool").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
+    AvgPoolingOp<CPUDevice, Eigen::half>);
 
 #if GOOGLE_CUDA
 template <typename T>
@@ -181,14 +184,17 @@ namespace functor {
       const Eigen::PaddingType& padding);                        \
   extern template struct SpatialAvgPooling<GPUDevice, T>;
 
+DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
-REGISTER_KERNEL_BUILDER(Name("AvgPool")
-                            .Device(DEVICE_GPU)
-                            .TypeConstraint<float>("T"),
-                        AvgPoolingOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
+    AvgPoolingOp<GPUDevice, Eigen::half>);
+REGISTER_KERNEL_BUILDER(
+    Name("AvgPool").Device(DEVICE_GPU).TypeConstraint<float>("T"),
+    AvgPoolingOp<GPUDevice, float>);
 #endif  // GOOGLE_CUDA
 
 // The operation to compute AvgPool gradients.
@@ -300,7 +306,7 @@ class AvgPoolingGradOp : public OpKernel {
                            GetBroadcastSize(c, in_cols, window_cols, col_stride,
                                             pad_cols, &cindex, &csize));
 
-            T divide_coeff = 1.0 / (rsize * csize);
+            T divide_coeff(1.0 / (rsize * csize));
             int64 output_index =
                 (b * out_backprop_rows + r) * out_backprop_cols + c;
             for (int64 r_dst = rindex; r_dst < rindex + rsize; ++r_dst) {
@@ -337,16 +343,16 @@ class AvgPoolingGradOp : public OpKernel {
   TensorFormat data_format_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T")
-                            .HostMemory("orig_input_shape"),
-                        AvgPoolingGradOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<double>("T")
-                            .HostMemory("orig_input_shape"),
-                        AvgPoolingGradOp<CPUDevice, double>);
+#define REGISTER_CPU_KERNEL(T)                                 \
+  REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")                  \
+                              .Device(DEVICE_CPU)              \
+                              .TypeConstraint<T>("T")          \
+                              .HostMemory("orig_input_shape"), \
+                          AvgPoolingGradOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_CPU_KERNEL);
+TF_CALL_double(REGISTER_CPU_KERNEL);
+TF_CALL_half(REGISTER_CPU_KERNEL);
 
 #if GOOGLE_CUDA
 
@@ -416,6 +422,12 @@ REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                             .HostMemory("orig_input_shape")
                             .Label("cudnn"),
                         AvgPoolingGradOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<Eigen::half>("T")
+                            .HostMemory("orig_input_shape")
+                            .Label("cudnn"),
+                        AvgPoolingGradOp<GPUDevice, Eigen::half>);
 
 // A custom GPU kernel based AvgPoolingGrad implementation. It includes the
 // padding as the candidates for the pooling operation.
@@ -532,6 +544,11 @@ REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
                             .TypeConstraint<float>("T")
                             .HostMemory("orig_input_shape"),
                         AvgPoolingGradOpCustomGPUKernel<float>);
+REGISTER_KERNEL_BUILDER(Name("AvgPoolGrad")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<Eigen::half>("T")
+                            .HostMemory("orig_input_shape"),
+                        AvgPoolingGradOpCustomGPUKernel<Eigen::half>);
 
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
index 9e894b1734d..a190b2168a7 100644
--- a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
@@ -33,6 +33,7 @@ typedef Eigen::GpuDevice GPUDevice;
 #define DEFINE_GPU_KERNELS(T) \
   template struct functor::SpatialAvgPooling<GPUDevice, T>;
 
+DEFINE_GPU_KERNELS(Eigen::half)
 DEFINE_GPU_KERNELS(float)
 
 #undef DEFINE_GPU_KERNELS
@@ -57,7 +58,7 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
     const int phend = min(h / stride_h + 1, pooled_height);
     const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
     const int pwend = min(w / stride_w + 1, pooled_width);
-    dtype gradient = 0;
+    dtype gradient(0);
     const dtype* const top_diff_slice =
         top_diff + n * pooled_height * pooled_width * channels + c;
     for (int ph = phstart; ph < phend; ++ph) {
@@ -104,6 +105,12 @@ template bool RunAvePoolBackwardNHWC(
     const int pooled_width, const int kernel_h, const int kernel_w,
     const int stride_h, const int stride_w, const int pad_t, const int pad_l,
     float* const bottom_diff, const GPUDevice& d);
+template bool RunAvePoolBackwardNHWC(
+    const Eigen::half* const top_diff, const int num, const int height,
+    const int width, const int channels, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+    Eigen::half* const bottom_diff, const GPUDevice& d);
 
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc
index a5f526780f2..f4aa7596435 100644
--- a/tensorflow/core/kernels/batch_norm_op.cc
+++ b/tensorflow/core/kernels/batch_norm_op.cc
@@ -159,9 +159,9 @@ class BatchNormGradOp : public OpKernel {
                               .TypeConstraint<T>("T"),             \
                           BatchNormOp<CPUDevice, T>);
 
-REGISTER_KERNEL(Eigen::half);
-REGISTER_KERNEL(float);
-REGISTER_KERNEL(double);
+TF_CALL_half(REGISTER_KERNEL);
+TF_CALL_float(REGISTER_KERNEL);
+TF_CALL_double(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
@@ -179,8 +179,8 @@ namespace functor {
 
 #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
 
-DECLARE_GPU_SPECS(Eigen::half);
-DECLARE_GPU_SPECS(float);
+TF_CALL_half(DECLARE_GPU_SPECS);
+TF_CALL_float(DECLARE_GPU_SPECS);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
@@ -191,8 +191,8 @@ DECLARE_GPU_SPECS(float);
                               .TypeConstraint<T>("T"),             \
                           BatchNormOp<GPUDevice, T>);
 
-REGISTER_GPU_KERNEL(Eigen::half);
-REGISTER_GPU_KERNEL(float);
+TF_CALL_half(REGISTER_GPU_KERNEL);
+TF_CALL_float(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 #endif  // GOOGLE_CUDA
@@ -203,9 +203,9 @@ REGISTER_GPU_KERNEL(float);
                               .TypeConstraint<T>("T"),                 \
                           BatchNormGradOp<CPUDevice, T>);
 
-REGISTER_KERNEL(Eigen::half);
-REGISTER_KERNEL(float);
-REGISTER_KERNEL(double);
+TF_CALL_half(REGISTER_KERNEL);
+TF_CALL_float(REGISTER_KERNEL);
+TF_CALL_double(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
@@ -226,8 +226,8 @@ namespace functor {
 
 #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
 
-DECLARE_GPU_SPECS(Eigen::half);
-DECLARE_GPU_SPECS(float);
+TF_CALL_half(DECLARE_GPU_SPECS);
+TF_CALL_float(DECLARE_GPU_SPECS);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
@@ -238,8 +238,8 @@ DECLARE_GPU_SPECS(float);
                               .TypeConstraint<T>("T"),                 \
                           BatchNormGradOp<GPUDevice, T>);
 
-REGISTER_GPU_KERNEL(Eigen::half);
-REGISTER_GPU_KERNEL(float);
+TF_CALL_half(REGISTER_GPU_KERNEL);
+TF_CALL_float(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/check_numerics_op.cc b/tensorflow/core/kernels/check_numerics_op.cc
index bc322ed139f..1d8874b4dfe 100644
--- a/tensorflow/core/kernels/check_numerics_op.cc
+++ b/tensorflow/core/kernels/check_numerics_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <numeric>
 
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 
@@ -182,18 +183,14 @@ class CheckNumericsOp<GPUDevice, T> : public OpKernel {
 
 }  // namespace
 
-REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<Eigen::half>("T"),
-                        CheckNumericsOp<CPUDevice, Eigen::half>);
-REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        CheckNumericsOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<double>("T"),
-                        CheckNumericsOp<CPUDevice, double>);
+#define REGISTER_CPU_KERNEL(T)                                         \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("CheckNumerics").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      CheckNumericsOp<CPUDevice, T>);
+TF_CALL_half(REGISTER_CPU_KERNEL);
+TF_CALL_float(REGISTER_CPU_KERNEL);
+TF_CALL_double(REGISTER_CPU_KERNEL);
+
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(Name("CheckNumerics")
                             .Device(DEVICE_GPU)
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc
index f9b7ed6ace1..014a3d78a94 100644
--- a/tensorflow/core/kernels/conv_grad_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_ops.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
@@ -622,35 +623,24 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropInputOp);
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("Conv2DBackpropInput").Device(DEVICE_CPU).TypeConstraint<float>("T"),
-    Conv2DCustomBackpropInputOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<Eigen::half>("T"),
-                        Conv2DCustomBackpropInputOp<CPUDevice, Eigen::half>);
+#define REGISTER_CPU_KERNELS(T)                                              \
+  REGISTER_KERNEL_BUILDER(                                                   \
+      Name("Conv2DBackpropInput").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      Conv2DCustomBackpropInputOp<CPUDevice, T>);                            \
+  REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")                        \
+                              .Device(DEVICE_CPU)                            \
+                              .Label("custom")                               \
+                              .TypeConstraint<T>("T"),                       \
+                          Conv2DCustomBackpropInputOp<CPUDevice, T>);        \
+  REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")                        \
+                              .Device(DEVICE_CPU)                            \
+                              .Label("eigen_tensor")                         \
+                              .TypeConstraint<T>("T"),                       \
+                          Conv2DFastBackpropInputOp<CPUDevice, T>);
 
-REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
-                            .Device(DEVICE_CPU)
-                            .Label("custom")
-                            .TypeConstraint<float>("T"),
-                        Conv2DCustomBackpropInputOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
-                            .Device(DEVICE_CPU)
-                            .Label("custom")
-                            .TypeConstraint<Eigen::half>("T"),
-                        Conv2DCustomBackpropInputOp<CPUDevice, Eigen::half>);
-
-REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
-                            .Device(DEVICE_CPU)
-                            .Label("eigen_tensor")
-                            .TypeConstraint<float>("T"),
-                        Conv2DFastBackpropInputOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
-                            .Device(DEVICE_CPU)
-                            .Label("eigen_tensor")
-                            .TypeConstraint<Eigen::half>("T"),
-                        Conv2DFastBackpropInputOp<CPUDevice, Eigen::half>);
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
 
 template <typename Device, class T>
 class Conv2DFastBackpropFilterOp : public OpKernel {
@@ -867,35 +857,24 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropFilterOp);
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("Conv2DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<float>("T"),
-    Conv2DCustomBackpropFilterOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<Eigen::half>("T"),
-                        Conv2DCustomBackpropFilterOp<CPUDevice, Eigen::half>);
+#define REGISTER_CPU_KERNELS(T)                                               \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("Conv2DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      Conv2DCustomBackpropFilterOp<CPUDevice, T>);                            \
+  REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")                        \
+                              .Device(DEVICE_CPU)                             \
+                              .Label("custom")                                \
+                              .TypeConstraint<T>("T"),                        \
+                          Conv2DCustomBackpropFilterOp<CPUDevice, T>);        \
+  REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")                        \
+                              .Device(DEVICE_CPU)                             \
+                              .Label("eigen_tensor")                          \
+                              .TypeConstraint<T>("T"),                        \
+                          Conv2DFastBackpropFilterOp<CPUDevice, T>);
 
-REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
-                            .Device(DEVICE_CPU)
-                            .Label("custom")
-                            .TypeConstraint<float>("T"),
-                        Conv2DCustomBackpropFilterOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
-                            .Device(DEVICE_CPU)
-                            .Label("custom")
-                            .TypeConstraint<Eigen::half>("T"),
-                        Conv2DCustomBackpropFilterOp<CPUDevice, Eigen::half>);
-
-REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
-                            .Device(DEVICE_CPU)
-                            .Label("eigen_tensor")
-                            .TypeConstraint<float>("T"),
-                        Conv2DFastBackpropFilterOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
-                            .Device(DEVICE_CPU)
-                            .Label("eigen_tensor")
-                            .TypeConstraint<Eigen::half>("T"),
-                        Conv2DFastBackpropFilterOp<CPUDevice, Eigen::half>);
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
 
 // GPU definitions of both ops.
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index d75bc026cd3..af6048a98bf 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
@@ -194,14 +195,13 @@ class Conv3DBackpropInputOp : public OpKernel {
   Padding padding_;
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("Conv3DBackpropInput").Device(DEVICE_CPU).TypeConstraint<float>("T"),
-    Conv3DBackpropInputOp<CPUDevice, float>);
-#ifndef IS_MOBILE_PLATFORM
-REGISTER_KERNEL_BUILDER(
-    Name("Conv3DBackpropInput").Device(DEVICE_CPU).TypeConstraint<double>("T"),
-    Conv3DBackpropInputOp<CPUDevice, double>);
-#endif
+#define REGISTER_CPU_KERNEL(T)                                               \
+  REGISTER_KERNEL_BUILDER(                                                   \
+      Name("Conv3DBackpropInput").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      Conv3DBackpropInputOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_CPU_KERNEL);
+TF_CALL_double(REGISTER_CPU_KERNEL);
+#undef REGISTER_CPU_KERNEL
 
 // Backprop for filter.
 template <typename Device, class T>
@@ -303,14 +303,13 @@ class Conv3DBackpropFilterOp : public OpKernel {
   Padding padding_;
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<float>("T"),
-    Conv3DBackpropFilterOp<CPUDevice, float>);
-#ifndef IS_MOBILE_PLATFORM
-REGISTER_KERNEL_BUILDER(
-    Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<double>("T"),
-    Conv3DBackpropFilterOp<CPUDevice, double>);
-#endif
+#define REGISTER_CPU_KERNEL(T)                                                \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("Conv3DBackpropFilter").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      Conv3DBackpropFilterOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_CPU_KERNEL);
+TF_CALL_double(REGISTER_CPU_KERNEL);
+#undef REGISTER_CPU_KERNEL
 
 // GPU definitions of both ops.
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 3a8ecacf93b..c64c6cd35c1 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
@@ -245,12 +246,16 @@ class Conv2DOp : public BinaryOp<T> {
   TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp);
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<float>("T"),
-    Conv2DOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(
-    Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
-    Conv2DOp<CPUDevice, Eigen::half>);
+// Register the CPU Conv2D kernel for half and float.
+#define REGISTER_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(                                      \
+      Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      Conv2DOp<CPUDevice, T>);
+
+TF_CALL_half(REGISTER_CPU);
+TF_CALL_float(REGISTER_CPU);
+#undef REGISTER_CPU
+
 #if GOOGLE_CUDA
 
 int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 35dd92e3159..697b3f62679 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
@@ -120,15 +121,13 @@ class Conv3DOp : public BinaryOp<T> {
   Padding padding_;
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<float>("T"),
-    Conv3DOp<CPUDevice, float>);
-
-#ifndef IS_MOBILE_PLATFORM
-REGISTER_KERNEL_BUILDER(
-    Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<double>("T"),
-    Conv3DOp<CPUDevice, double>);
-#endif
+#define REGISTER_CPU_KERNEL(T)                                  \
+  REGISTER_KERNEL_BUILDER(                                      \
+      Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      Conv3DOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_CPU_KERNEL);
+TF_CALL_double(REGISTER_CPU_KERNEL);
+#undef REGISTER_CPU_KERNEL
 
 #if GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index ffc6eeb809f..161c88d8145 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -566,16 +566,14 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
 };
 
-REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        DepthwiseConv2dNativeBackpropInputOp<CPUDevice, float>);
-
-REGISTER_KERNEL_BUILDER(
-    Name("DepthwiseConv2dNativeBackpropInput")
-        .Device(DEVICE_CPU)
-        .TypeConstraint<double>("T"),
-    DepthwiseConv2dNativeBackpropInputOp<CPUDevice, double>);
+#define REGISTER_CPU_KERNEL(T)                                       \
+  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
+                              .Device(DEVICE_CPU)                    \
+                              .TypeConstraint<T>("T"),               \
+                          DepthwiseConv2dNativeBackpropInputOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_CPU_KERNEL);
+TF_CALL_double(REGISTER_CPU_KERNEL);
+#undef REGISTER_CPU_KERNEL
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput")
@@ -951,17 +949,15 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("DepthwiseConv2dNativeBackpropFilter")
-        .Device(DEVICE_CPU)
-        .TypeConstraint<float>("T"),
-    DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, float>);
-
-REGISTER_KERNEL_BUILDER(
-    Name("DepthwiseConv2dNativeBackpropFilter")
-        .Device(DEVICE_CPU)
-        .TypeConstraint<double>("T"),
-    DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, double>);
+#define REGISTER_CPU_KERNEL(T)                    \
+  REGISTER_KERNEL_BUILDER(                        \
+      Name("DepthwiseConv2dNativeBackpropFilter") \
+          .Device(DEVICE_CPU)                     \
+          .TypeConstraint<T>("T"),                \
+      DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_CPU_KERNEL);
+TF_CALL_double(REGISTER_CPU_KERNEL);
+#undef REGISTER_CPU_KERNEL
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index c96365f4f02..4bee59aecd6 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -376,14 +376,15 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeOp);
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint<float>("T"),
-    DepthwiseConv2dNativeOp<CPUDevice, float>);
+// Register the CPU DepthwiseConv2dNative kernel for float and double.
+#define REGISTER_CPU_KERNEL(T)                                                 \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      DepthwiseConv2dNativeOp<CPUDevice, T>);
 
-REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<double>("T"),
-                        DepthwiseConv2dNativeOp<CPUDevice, double>);
+TF_CALL_float(REGISTER_CPU_KERNEL);
+TF_CALL_double(REGISTER_CPU_KERNEL);
+#undef REGISTER_CPU_KERNEL
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/draw_bounding_box_op.cc b/tensorflow/core/kernels/draw_bounding_box_op.cc
index 5fb2c9e471e..a825c5bb10f 100644
--- a/tensorflow/core/kernels/draw_bounding_box_op.cc
+++ b/tensorflow/core/kernels/draw_bounding_box_op.cc
@@ -143,13 +143,13 @@ class DrawBoundingBoxesOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("DrawBoundingBoxes").Device(DEVICE_CPU).TypeConstraint<float>("T"),
-    DrawBoundingBoxesOp<float>);
-
-REGISTER_KERNEL_BUILDER(Name("DrawBoundingBoxes")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<Eigen::half>("T"),
-                        DrawBoundingBoxesOp<Eigen::half>);
+// Register the CPU DrawBoundingBoxes kernel for half and float.
+#define REGISTER_CPU_KERNEL(T)                                             \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("DrawBoundingBoxes").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      DrawBoundingBoxesOp<T>);
+TF_CALL_half(REGISTER_CPU_KERNEL);
+TF_CALL_float(REGISTER_CPU_KERNEL);
+#undef REGISTER_CPU_KERNEL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h
index 349cbf9d0e8..aa3b2748935 100644
--- a/tensorflow/core/kernels/eigen_pooling.h
+++ b/tensorflow/core/kernels/eigen_pooling.h
@@ -309,7 +309,7 @@ struct AvgPoolMeanReducer {
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE AvgPoolMeanReducer() : scalarCount_(0) {
     typedef typename packet_traits<T>::type Packet;
-    packetCount_ = pset1<Packet>(0.0);
+    packetCount_ = pset1<Packet>(T(0.0));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index 6d956f4e3ed..ac1a5fea4d2 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 
 #if GOOGLE_CUDA
@@ -202,17 +203,19 @@ struct MatMulFunctor<CPUDevice, T> {
                               .Label("cublas"),                    \
                           MatMulOp<GPUDevice, T, true /* cublas */>)
 
-REGISTER_CPU(float);
-REGISTER_CPU(double);
-REGISTER_CPU(int32);
-REGISTER_CPU(Eigen::half);
-REGISTER_CPU(complex64);
-REGISTER_CPU(complex128);
+TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
+TF_CALL_half(REGISTER_CPU);
+
+TF_CALL_int32(REGISTER_CPU);
+TF_CALL_complex64(REGISTER_CPU);
+TF_CALL_complex128(REGISTER_CPU);
+
 #if GOOGLE_CUDA
-REGISTER_GPU(float);
-REGISTER_GPU(double);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
 #if CUDA_VERSION >= 7050
-REGISTER_GPU(Eigen::half);
+TF_CALL_half(REGISTER_GPU);
 #endif
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 5e3f2196992..f883acf3d6a 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -160,7 +160,7 @@ static void SpatialMaxPoolWithArgMaxHelper(
       const int in_end = limit * in_size;
       EigenMatrixMap in_shard(input_backprop_flat.data() + in_start, 1,
                               in_end - in_start);
-      in_shard.setConstant(0);
+      in_shard.setConstant(T(0));
 
       // Backpropagate.
       const int out_size = out_height * out_width * depth;
@@ -187,8 +187,12 @@ static void SpatialMaxPoolWithArgMaxHelper(
         params.tensor_in_batch, shard_cost, shard);
 }
 
-REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_CPU),
-                        MaxPoolingOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    MaxPoolingOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
+    MaxPoolingOp<CPUDevice, Eigen::half>);
 
 #if GOOGLE_CUDA
 // Forward declarations for the functor specializations for GPU.
@@ -212,6 +216,7 @@ DECLARE_GPU_SPEC(float);
 // kernel_label_map.
 REGISTER_KERNEL_BUILDER(Name("MaxPool")
                             .Device(DEVICE_GPU)
+                            .TypeConstraint<float>("T")
                             .Label("eigen_tensor"),
                         MaxPoolingOp<Eigen::GpuDevice, float>);
 #endif  // GOOGLE_CUDA
@@ -297,11 +302,16 @@ class MaxPoolingGradOp : public OpKernel {
   TensorFormat data_format_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_CPU),
-                        MaxPoolingGradOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    MaxPoolingGradOp<CPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
+    MaxPoolingGradOp<CPUDevice, Eigen::half>);
 
 #ifdef GOOGLE_CUDA
 
+template <typename T>
 static void MaxPoolingBackwardCustomKernel(
     OpKernelContext* context, const std::vector<int32>& size,
     const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in,
@@ -318,12 +328,12 @@ static void MaxPoolingBackwardCustomKernel(
   }
 
   MaxPoolBackwardNoMask(
-      tensor_in->flat<float>().data(), params.tensor_in_batch,
+      tensor_in->flat<T>().data(), params.tensor_in_batch,
       params.tensor_in_rows, params.tensor_in_cols, params.depth,
       params.out_height, params.out_width, params.window_rows,
       params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
-      params.pad_cols, out_backprop.flat<float>().data(),
-      output->flat<float>().data(), context->eigen_device<Eigen::GpuDevice>());
+      params.pad_cols, out_backprop.flat<T>().data(),
+      output->flat<T>().data(), context->eigen_device<Eigen::GpuDevice>());
 }
 
 template <class T>
@@ -378,8 +388,8 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
     } else {
       CHECK(data_format_ == FORMAT_NHWC)
           << "Non-Cudnn MaxPoolGrad only supports NHWC format";
-      MaxPoolingBackwardCustomKernel(context, ksize_, stride_, padding_,
-                                     &tensor_in, out_backprop, output_shape);
+      MaxPoolingBackwardCustomKernel<T>(context, ksize_, stride_, padding_,
+                                        &tensor_in, out_backprop, output_shape);
     }
   }
 
@@ -391,8 +401,12 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
   bool use_dnn_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_GPU),
-                        MaxPoolingGradOp<Eigen::GpuDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint<float>("T"),
+    MaxPoolingGradOp<Eigen::GpuDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
+    MaxPoolingGradOp<Eigen::GpuDevice, Eigen::half>);
 
 #endif  // GOOGLE_CUDA
 
@@ -625,8 +639,12 @@ struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_GPU),
-                        MaxPoolingNoMaskOp<Eigen::GpuDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<float>("T"),
+    MaxPoolingNoMaskOp<Eigen::GpuDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
+    MaxPoolingNoMaskOp<Eigen::GpuDevice, Eigen::half>);
 
 template <typename T>
 struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
@@ -649,8 +667,14 @@ struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
 
 REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
                             .Device(DEVICE_GPU)
-                            .TypeConstraint<int64>("Targmax"),
+                            .TypeConstraint<int64>("Targmax")
+                            .TypeConstraint<float>("T"),
                         MaxPoolingWithArgmaxOp<Eigen::GpuDevice, float>);
+REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int64>("Targmax")
+                            .TypeConstraint<Eigen::half>("T"),
+                        MaxPoolingWithArgmaxOp<Eigen::GpuDevice, Eigen::half>);
 
 template <typename T>
 struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
@@ -675,10 +699,18 @@ struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")
-                            .Device(DEVICE_GPU)
-                            .TypeConstraint<int64>("Targmax"),
-                        MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("MaxPoolGradWithArgmax")
+        .Device(DEVICE_GPU)
+        .TypeConstraint<float>("T")
+        .TypeConstraint<int64>("Targmax"),
+    MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("MaxPoolGradWithArgmax")
+        .Device(DEVICE_GPU)
+        .TypeConstraint<Eigen::half>("T")
+        .TypeConstraint<int64>("Targmax"),
+    MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, Eigen::half>);
 
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index 1bdca42f4e7..91b50b1e111 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -110,7 +110,7 @@ __global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
     int wend = min(wstart + kernel_w, width);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
-    dtype maxval = -FLT_MAX;
+    dtype maxval = Eigen::NumTraits<dtype>::lowest();
     int maxidx = -1;
     const dtype* bottom_data_n = bottom_data + n * height * width * channels;
     for (int h = hstart; h < hend; ++h) {
@@ -149,7 +149,7 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
     int wend = min(wstart + kernel_w, width);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
-    dtype maxval = -FLT_MAX;
+    dtype maxval = Eigen::NumTraits<dtype>::lowest();
     int maxidx = -1;
     const dtype* bottom_data_n = bottom_data + n * height * width * channels;
     for (int h = hstart; h < hend; ++h) {
@@ -165,8 +165,8 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
     // Atomically accumulate the bottom diff. The index could still be
     // uninitialized, if all the bottom_data are NaN.
     if (maxidx != -1) {
-      atomicAdd(bottom_diff + n * height * width * channels + maxidx,
-                top_diff[index]);
+      CudaAtomicAdd(bottom_diff + n * height * width * channels + maxidx,
+                    top_diff[index]);
     }
   }
 }
@@ -185,8 +185,8 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
 //     bottom_offset: the pre-computed per-image offset of the maxpool input.
 //         This is equal to H*W*C.
 //     bottom_diff: the gradient with respect to the input.
-// This function relies on atomicAdd to avoid race conditions. Also, before the
-// kernel is run, you will need to make sure that bottom_diff is filled with
+// This function relies on CudaAtomicAdd to avoid race conditions. Also, before
+// the kernel is run, you will need to make sure that bottom_diff is filled with
 // zero first.
 template <typename dtype>
 __global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
@@ -194,8 +194,8 @@ __global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
                                 const int bottom_offset, dtype* bottom_diff) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int image_id = (index / top_offset);
-    atomicAdd(bottom_diff + image_id * bottom_offset + mask[index],
-              top_diff[index]);
+    CudaAtomicAdd(bottom_diff + image_id * bottom_offset + mask[index],
+                  top_diff[index]);
   }
 }
 
@@ -219,6 +219,23 @@ bool MaxPoolForwardWithOptionalArgmax(
   return d.ok();
 }
 
+bool MaxPoolForwardWithOptionalArgmax(
+    const Eigen::half* bottom_data, const int batch, const int height,
+    const int width, const int channels, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+    Eigen::half* top_data, int64* mask, const Eigen::GpuDevice& d) {
+  const int kThreadsPerBlock = 1024;
+  const int output_size = batch * channels * pooled_height * pooled_width;
+
+  MaxPoolForwardNHWC<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+                       kThreadsPerBlock, 0, d.stream()>>>(
+      output_size, bottom_data, height, width, channels, pooled_height,
+      pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+      top_data, mask);
+  return d.ok();
+}
+
 bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
                            const int height, const int width,
                            const int channels, const int pooled_height,
@@ -243,6 +260,30 @@ bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
   return d.ok();
 }
 
+bool MaxPoolBackwardNoMask(const Eigen::half* bottom_data, const int batch,
+                           const int height, const int width,
+                           const int channels, const int pooled_height,
+                           const int pooled_width, const int kernel_h,
+                           const int kernel_w, const int stride_h,
+                           const int stride_w, const int pad_t, const int pad_l,
+                           const Eigen::half* top_diff, Eigen::half* bottom_diff,
+                           const Eigen::GpuDevice& d) {
+  const int kThreadsPerBlock = 1024;
+  const int bottom_size = batch * channels * height * width;
+  const int top_size = batch * channels * pooled_height * pooled_width;
+
+  SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+            kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff);
+
+  MaxPoolBackwardNoMaskNHWC<<<(top_size + kThreadsPerBlock - 1) /
+                                  kThreadsPerBlock,
+                              kThreadsPerBlock, 0, d.stream()>>>(
+      top_size, bottom_data, height, width, channels, pooled_height,
+      pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+      top_diff, bottom_diff);
+  return d.ok();
+}
+
 bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
                                const float* top_diff, const int64* mask,
                                const int top_offset, const int bottom_offset,
@@ -256,12 +297,27 @@ bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
   return d.ok();
 }
 
+bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
+                               const Eigen::half* top_diff, const int64* mask,
+                               const int top_offset, const int bottom_offset,
+                               Eigen::half* bottom_diff,
+                               const Eigen::GpuDevice& d) {
+  const int kThreadsPerBlock = 1024;
+  SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+            kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
+  MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+                    kThreadsPerBlock, 0, d.stream()>>>(
+      output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
+  return d.ok();
+}
+
 typedef Eigen::GpuDevice GPUDevice;
 
 #define DEFINE_GPU_KERNELS(T) \
   template struct functor::SpatialMaxPooling<GPUDevice, T>;
 
 DEFINE_GPU_KERNELS(float)
+DEFINE_GPU_KERNELS(Eigen::half)
 
 #undef DEFINE_GPU_KERNELS
 
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h
index 05e865f81c0..d1c73a372e9 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.h
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.h
@@ -37,11 +37,24 @@ bool MaxPoolForwardWithOptionalArgmax(
     const int stride_h, const int stride_w, const int pad_t, const int pad_l,
     float* top_data, int64* mask, const Eigen::GpuDevice& d);
 
+bool MaxPoolForwardWithOptionalArgmax(
+    const Eigen::half* bottom_data, const int batch, const int height,
+    const int width, const int channels, const int pooled_height,
+    const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
+    Eigen::half* top_data, int64* mask, const Eigen::GpuDevice& d);
+
 bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
                                const float* top_diff, const int64* mask,
                                const int top_offset, const int bottom_offset,
                                float* bottom_diff, const Eigen::GpuDevice& d);
 
+bool MaxPoolBackwardWithArgmax(const int output_size, const int input_size,
+                               const Eigen::half* top_diff, const int64* mask,
+                               const int top_offset, const int bottom_offset,
+                               Eigen::half* bottom_diff,
+                               const Eigen::GpuDevice& d);
+
 bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
                            const int height, const int width,
                            const int channels, const int pooled_height,
@@ -51,6 +64,15 @@ bool MaxPoolBackwardNoMask(const float* bottom_data, const int batch,
                            const float* top_diff, float* bottom_diff,
                            const Eigen::GpuDevice& d);
 
+bool MaxPoolBackwardNoMask(const Eigen::half* bottom_data, const int batch,
+                           const int height, const int width,
+                           const int channels, const int pooled_height,
+                           const int pooled_width, const int kernel_h,
+                           const int kernel_w, const int stride_h,
+                           const int stride_w, const int pad_t, const int pad_l,
+                           const Eigen::half* top_diff, Eigen::half* bottom_diff,
+                           const Eigen::GpuDevice& d);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_MAXPOOLING_OP_GPU_H_
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index c2d2cf3b65e..2f8b4515d09 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -104,10 +104,8 @@ class PackOp : public OpKernel {
       PackOp<CPUDevice, type>)
 
 TF_CALL_ALL_TYPES(REGISTER_PACK);
-REGISTER_PACK(quint8);
-REGISTER_PACK(qint8);
-REGISTER_PACK(qint32);
-REGISTER_PACK(bfloat16);
+TF_CALL_QUANTIZED_TYPES(REGISTER_PACK);
+TF_CALL_bfloat16(REGISTER_PACK);
 
 #undef REGISTER_PACK
 
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index 3867cc824f8..f5d7771af7f 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -124,6 +124,7 @@ namespace functor {
   extern template struct TransformDepth<GPUDevice, T, Eigen::DenseIndex>;
 
 DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(Eigen::half);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
@@ -368,7 +369,9 @@ void DnnPoolingGradOp<T>::Compute(
   }
 }
 
+template class DnnPoolingOp<Eigen::half>;
 template class DnnPoolingOp<float>;
+template class DnnPoolingGradOp<Eigen::half>;
 template class DnnPoolingGradOp<float>;
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h
index 138d1cb2ca6..593c90b0097 100644
--- a/tensorflow/core/kernels/pooling_ops_common.h
+++ b/tensorflow/core/kernels/pooling_ops_common.h
@@ -311,7 +311,7 @@ void SpatialAvgPool(OpKernelContext* context, Tensor* output,
       }
     }
   }
-  DCHECK_GT(out_count.minCoeff(), 0);
+  DCHECK_GT(out_count.minCoeff(), T(0));
   out_mat.array().rowwise() /= out_count.transpose().array();
 }
 
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.cc b/tensorflow/core/kernels/quantize_and_dequantize_op.cc
index 2c6e799a2db..98887ce9c3e 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.cc
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -107,15 +108,14 @@ struct QuantizeAndDequantizeOneScaleFunctor<CPUDevice, T> {
 };
 }  // namespace functor
 
-REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        QuantizeAndDequantizeOp<CPUDevice, float>);
-
-REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<double>("T"),
-                        QuantizeAndDequantizeOp<CPUDevice, double>);
+#define REGISTER_CPU_KERNEL(T)                           \
+  REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize") \
+                              .Device(DEVICE_CPU)        \
+                              .TypeConstraint<T>("T"),   \
+                          QuantizeAndDequantizeOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_CPU_KERNEL);
+TF_CALL_double(REGISTER_CPU_KERNEL);
+#undef REGISTER_CPU_KERNEL
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(Name("_QuantizeAndDequantize")
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index b42ef1d31d5..ed559142db9 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -451,11 +451,11 @@ class MultinomialOp : public OpKernel {
                               .TypeConstraint<IntType>("Tout"), \
                           RandomUniformIntOp<CPUDevice, IntType>);
 
-REGISTER(Eigen::half);
-REGISTER(float);
-REGISTER(double);
-REGISTER_INT(int32);
-REGISTER_INT(int64);
+TF_CALL_half(REGISTER);
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+TF_CALL_int32(REGISTER_INT);
+TF_CALL_int64(REGISTER_INT);
 
 #undef REGISTER
 #undef REGISTER_INT
@@ -505,11 +505,11 @@ REGISTER_INT(int64);
                               .TypeConstraint<IntType>("Tout"), \
                           RandomUniformIntOp<GPUDevice, IntType>);
 
-REGISTER(Eigen::half);
-REGISTER(float);
-REGISTER(double);
-REGISTER_INT(int32);
-REGISTER_INT(int64);
+TF_CALL_half(REGISTER);
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+TF_CALL_int32(REGISTER_INT);
+TF_CALL_int64(REGISTER_INT);
 
 #undef REGISTER
 #undef REGISTER_INT
diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc
index 9539f93644a..26af8c9c2c6 100644
--- a/tensorflow/core/kernels/reduction_ops_sum.cc
+++ b/tensorflow/core/kernels/reduction_ops_sum.cc
@@ -25,8 +25,8 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 // NOTE: We should have mean(complex64,int32), too. But that needs to
 // change Eigen::internal::MeanReducer to cast int to complex<float>.
 // We don't see immediate need of mean(complex64,int32) anyway.
-REGISTER_CPU_KERNELS(complex64);
-REGISTER_CPU_KERNELS(complex128);
+TF_CALL_complex64(REGISTER_CPU_KERNELS);
+TF_CALL_complex128(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc
index 9df91d13811..606c6c8a9ce 100644
--- a/tensorflow/core/kernels/resize_bilinear_op.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op.cc
@@ -159,15 +159,12 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNEL);
 
 #undef REGISTER_KERNEL
 
-REGISTER_KERNEL_BUILDER(Name("ResizeBilinearGrad")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        ResizeBilinearOpGrad<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("ResizeBilinearGrad")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<Eigen::half>("T"),
-                        ResizeBilinearOpGrad<CPUDevice, Eigen::half>);
-REGISTER_KERNEL_BUILDER(
-    Name("ResizeBilinearGrad").Device(DEVICE_CPU).TypeConstraint<double>("T"),
-    ResizeBilinearOpGrad<CPUDevice, double>);
+#define REGISTER_CPU_GRAD_KERNEL(T)                                         \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("ResizeBilinearGrad").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      ResizeBilinearOpGrad<CPUDevice, T>);
+TF_CALL_half(REGISTER_CPU_GRAD_KERNEL);
+TF_CALL_float(REGISTER_CPU_GRAD_KERNEL);
+TF_CALL_double(REGISTER_CPU_GRAD_KERNEL);
+#undef REGISTER_CPU_GRAD_KERNEL  // Keep the helper macro local to this file.
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index 9d4c3a2a556..18fb4805156 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -97,13 +97,13 @@ class ReverseOp : public OpKernel {
                               .HostMemory("dims"),    \
                           ReverseOp<CPUDevice, T>)
 
-REGISTER_KERNEL(uint8);
-REGISTER_KERNEL(int8);
-REGISTER_KERNEL(int32);
-REGISTER_KERNEL(bool);
-REGISTER_KERNEL(Eigen::half);
-REGISTER_KERNEL(float);
-REGISTER_KERNEL(double);
+TF_CALL_uint8(REGISTER_KERNEL);
+TF_CALL_int8(REGISTER_KERNEL);
+TF_CALL_int32(REGISTER_KERNEL);
+TF_CALL_bool(REGISTER_KERNEL);
+TF_CALL_half(REGISTER_KERNEL);
+TF_CALL_float(REGISTER_KERNEL);
+TF_CALL_double(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
@@ -129,13 +129,13 @@ namespace functor {
   DECLARE_GPU_SPEC_DIM(T, 7) \
   DECLARE_GPU_SPEC_DIM(T, 8)
 
-DECLARE_GPU_SPEC(uint8);
-DECLARE_GPU_SPEC(int8);
-DECLARE_GPU_SPEC(int32);
-DECLARE_GPU_SPEC(bool);
-DECLARE_GPU_SPEC(Eigen::half);
-DECLARE_GPU_SPEC(float);
-DECLARE_GPU_SPEC(double);
+TF_CALL_uint8(DECLARE_GPU_SPEC);
+TF_CALL_int8(DECLARE_GPU_SPEC);
+TF_CALL_int32(DECLARE_GPU_SPEC);
+TF_CALL_bool(DECLARE_GPU_SPEC);
+TF_CALL_half(DECLARE_GPU_SPEC);
+TF_CALL_float(DECLARE_GPU_SPEC);
+TF_CALL_double(DECLARE_GPU_SPEC);
 #undef DECLARE_GPU_SPEC
 #undef DECLARE_GPU_SPEC_DIM
 }  // namespace functor
@@ -147,11 +147,11 @@ DECLARE_GPU_SPEC(double);
                               .TypeConstraint<T>("T") \
                               .HostMemory("dims"),    \
                           ReverseOp<GPUDevice, T>)
-REGISTER_GPU_KERNEL(uint8);
-REGISTER_GPU_KERNEL(int8);
-REGISTER_GPU_KERNEL(Eigen::half);
-REGISTER_GPU_KERNEL(float);
-REGISTER_GPU_KERNEL(double);
+TF_CALL_uint8(REGISTER_GPU_KERNEL);
+TF_CALL_int8(REGISTER_GPU_KERNEL);
+TF_CALL_half(REGISTER_GPU_KERNEL);
+TF_CALL_float(REGISTER_GPU_KERNEL);
+TF_CALL_double(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
index 67a8a90c2fb..0acde9c498b 100644
--- a/tensorflow/core/kernels/sequence_ops.cc
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -118,21 +118,16 @@ class LinSpaceOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("LinSpace")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T")
-                            .HostMemory("start")
-                            .HostMemory("stop")
-                            .HostMemory("num")
-                            .HostMemory("output"),
-                        LinSpaceOp<float>);
-REGISTER_KERNEL_BUILDER(Name("LinSpace")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<double>("T")
-                            .HostMemory("start")
-                            .HostMemory("stop")
-                            .HostMemory("num")
-                            .HostMemory("output"),
-                        LinSpaceOp<double>);
+#define REGISTER_CPU_KERNEL(T)                        \
+  REGISTER_KERNEL_BUILDER(Name("LinSpace")            \
+                              .Device(DEVICE_CPU)     \
+                              .TypeConstraint<T>("T") \
+                              .HostMemory("start")    \
+                              .HostMemory("stop")     \
+                              .HostMemory("num")      \
+                              .HostMemory("output"),  \
+                          LinSpaceOp<T>);
+TF_CALL_float(REGISTER_CPU_KERNEL);
+TF_CALL_double(REGISTER_CPU_KERNEL);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc
index 82376862ca6..8ec8409e21d 100644
--- a/tensorflow/core/kernels/softmax_op.cc
+++ b/tensorflow/core/kernels/softmax_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/softmax_op.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 
@@ -40,27 +41,22 @@ struct SoftmaxFunctor<CPUDevice, T> {
 };
 }  // namespace functor
 
-REGISTER_KERNEL_BUILDER(
-    Name("Softmax").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
-    SoftmaxOp<CPUDevice, Eigen::half>);
-REGISTER_KERNEL_BUILDER(Name("Softmax")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        SoftmaxOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("Softmax")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<double>("T"),
-                        SoftmaxOp<CPUDevice, double>);
-REGISTER_KERNEL_BUILDER(
-    Name("LogSoftmax").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"),
-    SoftmaxOp<CPUDevice, Eigen::half>);
-REGISTER_KERNEL_BUILDER(
-    Name("LogSoftmax").Device(DEVICE_CPU).TypeConstraint<float>("T"),
-    SoftmaxOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("LogSoftmax")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<double>("T"),
-                        SoftmaxOp<CPUDevice, double>);
+#define REGISTER_CPU(T)                                          \
+  REGISTER_KERNEL_BUILDER(                                       \
+      Name("Softmax").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      SoftmaxOp<CPUDevice, T>);
+TF_CALL_half(REGISTER_CPU);
+TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
+
+#undef REGISTER_CPU
+#define REGISTER_CPU(T)                                             \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("LogSoftmax").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      SoftmaxOp<CPUDevice, T>);
+TF_CALL_half(REGISTER_CPU);
+TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index d10cbad1d20..613c6a15c5b 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -157,10 +157,15 @@ EIGEN_STRONG_INLINE Packet8f pinterleave4x64<Packet8f>(const Packet8f& from) {
   return _mm256_castsi256_ps(_mm256_permute4x64_epi64(_mm256_castps_si256(from),
                                                       _MM_SHUFFLE(3, 1, 2, 0)));
 #else
-  __int64_t tmp1 = _mm256_extract_epi64(_mm256_castps_si256(from), 1);
-  __int64_t tmp2 = _mm256_extract_epi64(_mm256_castps_si256(from), 2);
-  __m256i tmp3 = _mm256_insert_epi64(_mm256_castps_si256(from), tmp1, 2);
-  return _mm256_castsi256_ps(_mm256_insert_epi64(tmp3, tmp2, 1));
+  auto tmp1 = _mm256_extract_epi32(_mm256_castps_si256(from), 2);
+  auto tmp2 = _mm256_extract_epi32(_mm256_castps_si256(from), 3);
+  auto tmp3 = _mm256_extract_epi32(_mm256_castps_si256(from), 4);
+  auto tmp4 = _mm256_extract_epi32(_mm256_castps_si256(from), 5);
+  auto tmp5 = _mm256_insert_epi32(_mm256_castps_si256(from), tmp1, 4);
+  tmp5 = _mm256_insert_epi32(tmp5, tmp2, 5);
+  tmp5 = _mm256_insert_epi32(tmp5, tmp3, 2);
+  tmp5 = _mm256_insert_epi32(tmp5, tmp4, 3);
+  return _mm256_castsi256_ps(tmp5);
 #endif
 }
 // Return a Packet with 4 floats loaded from 4 bfloat16 values
diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc
index cb885808000..45cad2e23b1 100644
--- a/tensorflow/core/kernels/sparse_matmul_op_test.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc
@@ -238,25 +238,25 @@ class SparseMatmulOpTest : public ::testing::Test {
 
 TEST_F(SparseMatmulOpTest, BroadcastPacketTest) {
   for (int i = 0; i < PacketSize; ++i) ref[i] = data1[0];
-  internal::pstore(data2, internal::pbroadcast_first<Packet>(
-                              internal::pload<Packet>(data1)));
+  internal::pstoreu(data2, internal::pbroadcast_first<Packet>(
+                               internal::ploadu<Packet>(data1)));
   ASSERT_TRUE(areApprox(ref, data2, PacketSize));
   if (PacketSize > 1) {
     for (int i = 0; i < PacketSize; ++i) ref[i] = data1[1];
-    internal::pstore(data2, internal::pbroadcast_second<Packet>(
-                                internal::pload<Packet>(data1)));
+    internal::pstoreu(data2, internal::pbroadcast_second<Packet>(
+                                 internal::ploadu<Packet>(data1)));
     ASSERT_TRUE(areApprox(ref, data2, PacketSize));
 
     if (PacketSize > 2) {
       for (int i = 0; i < PacketSize; ++i) ref[i] = data1[2];
-      internal::pstore(data2, internal::pbroadcast_third<Packet>(
-                                  internal::pload<Packet>(data1)));
+      internal::pstoreu(data2, internal::pbroadcast_third<Packet>(
+                                   internal::ploadu<Packet>(data1)));
       ASSERT_TRUE(areApprox(ref, data2, PacketSize));
 
       if (PacketSize > 3) {
         for (int i = 0; i < PacketSize; ++i) ref[i] = data1[3];
-        internal::pstore(data2, internal::pbroadcast_fourth<Packet>(
-                                    internal::pload<Packet>(data1)));
+        internal::pstoreu(data2, internal::pbroadcast_fourth<Packet>(
+                                     internal::ploadu<Packet>(data1)));
         ASSERT_TRUE(areApprox(ref, data2, PacketSize));
       }
     }
@@ -276,8 +276,8 @@ TEST_F(SparseMatmulOpTest, InterleavePacketTest) {
     for (int i = 0; i < PacketSize; ++i) ref[i] = data1[i];
   }
 
-  internal::pstore(
-      data2, internal::pinterleave4x64<Packet>(internal::pload<Packet>(data1)));
+  internal::pstoreu(data2, internal::pinterleave4x64<Packet>(
+                               internal::ploadu<Packet>(data1)));
   ASSERT_TRUE(areApprox(ref, data2, PacketSize));
 }
 
@@ -294,8 +294,8 @@ TEST_F(SparseMatmulOpTest, Bfloat16ExpandTest) {
       ref[i] = data3[i];
     }
   }
-  internal::pstore(data2, internal::pexpand_bf16_l<Packet>(
-                              internal::pload<Packet>(data3_bfloat16)));
+  internal::pstoreu(data2, internal::pexpand_bf16_l<Packet>(
+                               internal::ploadu<Packet>(data3_bfloat16)));
   ASSERT_TRUE(areApprox(ref, data2, PacketSize));
 
   if (PacketSize == 8) {  // AVX
@@ -311,18 +311,18 @@ TEST_F(SparseMatmulOpTest, Bfloat16ExpandTest) {
     }
   }
 
-  internal::pstore(data2, internal::pexpand_bf16_u<Packet>(
-                              internal::pload<Packet>(data3_bfloat16)));
+  internal::pstoreu(data2, internal::pexpand_bf16_u<Packet>(
+                               internal::ploadu<Packet>(data3_bfloat16)));
   ASSERT_TRUE(areApprox(ref, data2, PacketSize));
 }
 
 TEST_F(SparseMatmulOpTest, Bfloat16LoadTest) {
   if (PacketSize >= 4) {
     for (int i = 0; i < 4; ++i) ref[i] = data3[i];
-    internal::pstore(data2, internal::pload4bf16<Packet>(data3_bfloat16));
+    internal::pstoreu(data2, internal::pload4bf16<Packet>(data3_bfloat16));
     ASSERT_TRUE(areApprox(ref, data2, 4));
 
-    internal::pstore(data2, internal::pload2bf16<Packet>(data3_bfloat16));
+    internal::pstoreu(data2, internal::pload2bf16<Packet>(data3_bfloat16));
     ASSERT_TRUE(areApprox(ref, data2, 2));
   }
 }
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index 52d2e637d17..5990bfbcf3c 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/type_index.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -92,18 +93,22 @@ class TileOp : public OpKernel {
   HANDLE_DIM(T, 4)     \
   HANDLE_DIM(T, 5)
 
-    HANDLE_TYPE(DT_BOOL);
-    HANDLE_TYPE(DT_FLOAT);
-    HANDLE_TYPE(DT_DOUBLE);
-    HANDLE_TYPE(DT_UINT8);
-    HANDLE_TYPE(DT_INT32);
-    HANDLE_TYPE(DT_INT16);
-    HANDLE_TYPE(DT_INT64);
-    HANDLE_TYPE(DT_HALF);
-    HANDLE_TYPE(DT_COMPLEX64);
-    HANDLE_TYPE(DT_COMPLEX128);
-    HANDLE_TYPE(DT_STRING);  // when DEVICE=CPUDevice.
+#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)
 
+    // Invoke macro using TF_CALL_* so type-filtering for platform applies.
+    TF_CALL_bool(HANDLE_TYPE_NAME);
+    TF_CALL_float(HANDLE_TYPE_NAME);
+    TF_CALL_double(HANDLE_TYPE_NAME);
+    TF_CALL_uint8(HANDLE_TYPE_NAME);
+    TF_CALL_int32(HANDLE_TYPE_NAME);
+    TF_CALL_int16(HANDLE_TYPE_NAME);
+    TF_CALL_int64(HANDLE_TYPE_NAME);
+    TF_CALL_half(HANDLE_TYPE_NAME);
+    TF_CALL_string(HANDLE_TYPE_NAME);  // when DEVICE=CPUDevice.
+    TF_CALL_complex64(HANDLE_TYPE_NAME);
+    TF_CALL_complex128(HANDLE_TYPE_NAME);
+
+#undef HANDLE_TYPE_NAME
 #undef HANDLE_TYPE
 #undef HANDLE_DIM
 
@@ -165,17 +170,20 @@ inline void TileOp<Device>::HandleCase(
   HANDLE_CASE(device, dtype, 4);       \
   HANDLE_CASE(device, dtype, 5);
 
-HANDLE_CASE_DIM(CPUDevice, DT_BOOL);
-HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
-HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE);
-HANDLE_CASE_DIM(CPUDevice, DT_UINT8);
-HANDLE_CASE_DIM(CPUDevice, DT_INT32);
-HANDLE_CASE_DIM(CPUDevice, DT_INT16);
-HANDLE_CASE_DIM(CPUDevice, DT_INT64);
-HANDLE_CASE_DIM(CPUDevice, DT_HALF);
-HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX64);
-HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX128);
-HANDLE_CASE_DIM(CPUDevice, DT_STRING);
+#define HANDLE_TYPE_NAME_CPU(T) \
+  HANDLE_CASE_DIM(CPUDevice, DataTypeToEnum<T>::value);
+
+TF_CALL_bool(HANDLE_TYPE_NAME_CPU);
+TF_CALL_float(HANDLE_TYPE_NAME_CPU);
+TF_CALL_double(HANDLE_TYPE_NAME_CPU);
+TF_CALL_uint8(HANDLE_TYPE_NAME_CPU);
+TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
+TF_CALL_int16(HANDLE_TYPE_NAME_CPU);
+TF_CALL_int64(HANDLE_TYPE_NAME_CPU);
+TF_CALL_half(HANDLE_TYPE_NAME_CPU);
+TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
+TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
+TF_CALL_string(HANDLE_TYPE_NAME_CPU);
 
 #if GOOGLE_CUDA
 HANDLE_CASE_DIM(GPUDevice, DT_FLOAT);
@@ -186,6 +194,7 @@ HANDLE_CASE_DIM(GPUDevice, DT_INT64);
 HANDLE_CASE_DIM(GPUDevice, DT_HALF);
 #endif  // GOOGLE_CUDA
 
+#undef HANDLE_TYPE_NAME_CPU
 #undef HANDLE_CASE_DIM
 #undef HANDLE_CASE
 
@@ -249,13 +258,16 @@ class TileGradientOp : public OpKernel {
   HANDLE_DIM(T, 4)     \
   HANDLE_DIM(T, 5)
 
-    HANDLE_TYPE(DT_FLOAT);
-    HANDLE_TYPE(DT_DOUBLE);
-    HANDLE_TYPE(DT_INT32);
-    HANDLE_TYPE(DT_INT16);
-    HANDLE_TYPE(DT_INT64);
-    HANDLE_TYPE(DT_HALF);
+#define HANDLE_TYPE_NAME(T) HANDLE_TYPE(DataTypeToEnum<T>::value)
 
+    TF_CALL_float(HANDLE_TYPE_NAME);
+    TF_CALL_double(HANDLE_TYPE_NAME);
+    TF_CALL_int32(HANDLE_TYPE_NAME);
+    TF_CALL_int16(HANDLE_TYPE_NAME);
+    TF_CALL_int64(HANDLE_TYPE_NAME);
+    TF_CALL_half(HANDLE_TYPE_NAME);
+
+#undef HANDLE_TYPE_NAME
 #undef HANDLE_TYPE
 #undef HANDLE_DIM
 
@@ -390,14 +402,17 @@ inline void TileGradientOp<Device>::HandleCase(
   HANDLE_CASE(device, dtype, 4);       \
   HANDLE_CASE(device, dtype, 5);
 
-HANDLE_CASE_DIM(CPUDevice, DT_FLOAT);
-HANDLE_CASE_DIM(CPUDevice, DT_DOUBLE);
-HANDLE_CASE_DIM(CPUDevice, DT_INT16);
-HANDLE_CASE_DIM(CPUDevice, DT_INT32);
-HANDLE_CASE_DIM(CPUDevice, DT_INT64);
-HANDLE_CASE_DIM(CPUDevice, DT_HALF);
-HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX64);
-HANDLE_CASE_DIM(CPUDevice, DT_COMPLEX128);
+#define HANDLE_TYPE_NAME_CPU(T) \
+  HANDLE_CASE_DIM(CPUDevice, DataTypeToEnum<T>::value);
+
+TF_CALL_float(HANDLE_TYPE_NAME_CPU);
+TF_CALL_double(HANDLE_TYPE_NAME_CPU);
+TF_CALL_int16(HANDLE_TYPE_NAME_CPU);
+TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
+TF_CALL_int64(HANDLE_TYPE_NAME_CPU);
+TF_CALL_half(HANDLE_TYPE_NAME_CPU);
+TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
+TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
 
 #if GOOGLE_CUDA
 HANDLE_CASE_DIM(GPUDevice, DT_FLOAT);
@@ -409,6 +424,7 @@ HANDLE_CASE_DIM(GPUDevice, DT_HALF);
 
 #endif  // GOOGLE_CUDA
 
+#undef HANDLE_TYPE_NAME_CPU
 #undef HANDLE_CASE_DIM
 #undef HANDLE_CASE
 
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index f00e7ef8742..b16c9c860a9 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -16,17 +16,28 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "tensorflow/core/kernels/training_ops.h"
+#include <algorithm>
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-namespace functor {
+namespace {
+template <class T>
+inline T sgn(const T x) {  // Sign of x as a T: T(0), -T(1) or T(1).
+  T zero(0);
+  T one(1);
+  return (x == zero ? zero : (x < zero ? -one : one));
+}
+}  // namespace
 
+namespace functor {
 template <typename T>
 struct ApplyGradientDescent<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@@ -56,6 +67,34 @@ struct ApplyAdadelta<CPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyProximalGradientDescent<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar l1,
+                  typename TTypes<T>::ConstScalar l2,
+                  typename TTypes<T>::ConstFlat grad) {
+    // FOBOS (forward-backward splitting) update applied in place to var; see
+    // http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf
+    // TODO(xbing): merge the logic for ProximalGradientDescent and
+    // ProximalAdagrad.
+    auto prox_var = var;
+    // compute v = w - lr * grad; prox_var shares var's buffer (TensorMap copy).
+    prox_var.device(d) -= grad * lr();
+    if (l1() > 0) {
+      // sign(v) * max(|v| - lr * l1, 0), fused so v is read before overwrite.
+      var.device(d) = prox_var.sign() *
+          (prox_var.abs() - var.constant(lr() * l1())).cwiseMax(T(0.0));
+    } else {
+      var.device(d) = prox_var;
+    }
+    if (l2() > 0) {
+      // compute v / (1.0 + l2 * lr)
+      var.device(d) = var / (var.constant(1.0) + var.constant(l2() * lr()));
+    }
+  }
+};
+
 template <typename T>
 struct ApplyAdagrad<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@@ -67,6 +106,35 @@ struct ApplyAdagrad<CPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyProximalAdagrad<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar l1,
+                  typename TTypes<T>::ConstScalar l2,
+                  typename TTypes<T>::ConstFlat grad) {
+    // Fobos update per paper with Adagrad learning rate; updates accum and var.
+    accum.device(d) += grad.square();
+    // Adagrad learning rate.
+    auto learning_rate = accum.constant(lr()) * accum.rsqrt();
+    auto prox_var = var;
+    // compute v = w - lr * grad; prox_var shares var's buffer (TensorMap copy).
+    prox_var.device(d) -= grad * learning_rate;
+    if (l1() > 0) {
+      // sign(v) * max(|v| - learning_rate * l1, 0), fused so v is read first.
+      var.device(d) = prox_var.sign() * (prox_var.abs() -
+          learning_rate * prox_var.constant(l1())).cwiseMax(T(0.0));
+    } else {
+      var.device(d) = prox_var;
+    }
+    if (l2() > 0) {
+      var.device(d) =
+          var / (var.constant(1.0) + var.constant(l2()) * learning_rate);
+    }
+  }
+};
+
 template <typename T>
 struct ApplyFtrl<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@@ -221,10 +289,11 @@ class ApplyGradientDescentOp : public OpKernel {
   REGISTER_KERNEL_BUILDER(                                                    \
       Name("ApplyGradientDescent").Device(DEVICE_##D).TypeConstraint<T>("T"), \
       ApplyGradientDescentOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
-REGISTER_KERNELS(CPU, Eigen::half);
-REGISTER_KERNELS(CPU, float);
-REGISTER_KERNELS(CPU, double);
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
@@ -246,6 +315,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
 REGISTER_KERNELS(GPU, float);
 REGISTER_KERNELS(GPU, double);
 #endif
+#undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
 template <typename Device, typename T>
@@ -347,10 +417,11 @@ typedef Eigen::GpuDevice GPUDevice;
   REGISTER_KERNEL_BUILDER(                                             \
       Name("ApplyAdadelta").Device(DEVICE_##D).TypeConstraint<T>("T"), \
       ApplyAdadeltaOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
-REGISTER_KERNELS(CPU, Eigen::half);
-REGISTER_KERNELS(CPU, float);
-REGISTER_KERNELS(CPU, double);
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
@@ -374,6 +445,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
 REGISTER_KERNELS(GPU, float);
 REGISTER_KERNELS(GPU, double);
 #endif
+#undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
 // Note, this op works on cpu only.
@@ -483,7 +555,6 @@ class SparseApplyAdadeltaOp : public OpKernel {
         accum_update_ =
             accum_update_ * accum_update_.constant(rho_scalar) +
             update.square() * update.constant(static_cast<T>(1) - rho_scalar);
-
         auto v = var_flat.template chip<0>(index);
         v -= update * update.constant(lr_scalar);
       }
@@ -505,14 +576,213 @@ class SparseApplyAdadeltaOp : public OpKernel {
                               .TypeConstraint<T>("T")                \
                               .TypeConstraint<Tindices>("Tindices"), \
                           SparseApplyAdadeltaOp<T, Tindices>);
+#define REGISTER_CPU_KERNELS(T) \
+  REGISTER_KERNELS(T, int32);   \
+  REGISTER_KERNELS(T, int64);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
+// Dense proximal gradient descent kernel. Note, this op works on cpu only.
+template <typename Device, typename T>
+class ApplyProximalGradientDescentOp : public OpKernel {
+ public:
+  explicit ApplyProximalGradientDescentOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+  // Validates inputs, then invokes functor::ApplyProximalGradientDescent on var.
+  void Compute(OpKernelContext* ctx) override {
+    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(0)));
+    const Tensor& alpha = ctx->input(1);  // passed to the functor as lr
+    OP_REQUIRES(ctx, IsLegacyScalar(alpha.shape()),
+                errors::InvalidArgument("alpha is not a scalar: ",
+                                        alpha.shape().DebugString()));
+    const Tensor& l1 = ctx->input(2);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(l1.shape()),
+        errors::InvalidArgument("l1 regularization strength is not a scalar: ",
+                                l1.shape().DebugString()));
+    const Tensor& l2 = ctx->input(3);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(l2.shape()),
+        errors::InvalidArgument("l2 regularization strength is not a scalar: ",
+                                l2.shape().DebugString()));
+
+    const Tensor& delta = ctx->input(4);  // passed to the functor as grad
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(delta.shape()),
+        errors::InvalidArgument("var and delta do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                delta.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyProximalGradientDescent<Device, T>()(
+        device, var.flat<T>(), alpha.scalar<T>(), l1.scalar<T>(),
+        l2.scalar<T>(), delta.flat<T>());
+    // Output 0 is forwarded from ref input 0 (var).
+    ctx->forward_ref_input_to_ref_output(0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;  // value of the "use_locking" attr
+};
+
+#define REGISTER_KERNELS(D, T)                                 \
+  REGISTER_KERNEL_BUILDER(Name("ApplyProximalGradientDescent") \
+                              .Device(DEVICE_##D)              \
+                              .TypeConstraint<T>("T"),         \
+                          ApplyProximalGradientDescentOp<D##Device, T>);
+// Only CPU kernels for float and double are registered for this op.
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
+#undef REGISTER_KERNELS
+
+// Sparse proximal (Fobos) gradient descent on rows of var. CPU only.
+template <typename T, typename Tindex>
+class SparseApplyProximalGradientDescentOp : public OpKernel {
+ public:
+  explicit SparseApplyProximalGradientDescentOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+  // Only var (input 0) is a ref input, so it is the only mutex to lock.
+  void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
+    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
+                errors::InvalidArgument("var must be at least 1 dimensional"));
+
+    const Tensor& lr = ctx->input(1);
+    OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr.shape().DebugString()));
+    const Tensor& l1 = ctx->input(2);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(l1.shape()),
+        errors::InvalidArgument("l1 regularization strength is not a scalar: ",
+                                l1.shape().DebugString()));
+    const Tensor& l2 = ctx->input(3);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(l2.shape()),
+        errors::InvalidArgument("l2 regularization strength is not a scalar: ",
+                                l2.shape().DebugString()));
+
+    const Tensor& grad = ctx->input(4);
+    const Tensor& indices = ctx->input(5);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
+                errors::InvalidArgument("indices must be one-dimensional"));
+
+    int64 inner_dim = 1;  // product of grad's dimensions 1..rank-1
+    for (int d = 1; d < var.dims(); d++) {
+      OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
+                  errors::InvalidArgument(strings::StrCat(
+                      "var and grad must match in dimension ", d)));
+      inner_dim *= grad.dim_size(d);
+    }
+    const Tindex N = indices.dim_size(0);
+    OP_REQUIRES(
+        ctx, grad.dim_size(0) == N,
+        errors::InvalidArgument(
+            "grad must be the same size as indices in the first dimension."));
+
+    if (N > 0) {
+      if (inner_dim > 1) {
+        const Tindex first_dim_size = var.dim_size(0);
+        auto indices_vec = indices.vec<Tindex>();
+        auto var_flat = var.flat_outer_dims<T>();
+        auto grad_flat = grad.flat_outer_dims<T>();
+        T lr_scalar = lr.scalar<T>()();
+        T l1_scalar = l1.scalar<T>()();
+        T l2_scalar = l2.scalar<T>()();
+
+        // TODO(xbing): extract the common logic for the Fobos update.
+        for (Tindex i = 0; i < N; i++) {
+          const Tindex index = internal::SubtleMustCopy(indices_vec(i));
+          OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
+                      errors::InvalidArgument(
+                          strings::StrCat("Index ", index, " at offset ", i,
+                                          " in indices is out of range")));
+          auto g = grad_flat.template chip<0>(i);
+          auto v = var_flat.template chip<0>(index);
+          auto learning_rate = v.constant(lr_scalar);
+          // prox_v views the same row as v, so this step updates var in place.
+          auto prox_v = v;
+          prox_v -= g * learning_rate;
+          if (l1_scalar > 0) {
+            // Fused: the old two-step form overwrote v before sign() read it.
+            v = prox_v.sign() *
+                (prox_v.abs() - learning_rate * prox_v.constant(l1_scalar))
+                    .cwiseMax(static_cast<T>(0.0));
+          } else {
+            v = prox_v;
+          }
+          if (l2_scalar > 0) {
+            v /= (v.constant(1.0) + v.constant(l2_scalar) * learning_rate);
+          }
+        }
+      } else {
+        CHECK_EQ(1, inner_dim);
+        auto indices_vec = indices.vec<Tindex>();
+        auto var_flat = var.flat<T>();
+        auto grad_flat = grad.flat<T>();
+        T lr_scalar = lr.scalar<T>()();
+        T l1_scalar = l1.scalar<T>()();
+        T l2_scalar = l2.scalar<T>()();
+        const Tindex first_dim_size = var_flat.size();
+
+        for (Tindex i = 0; i < N; i++) {
+          const Tindex index = internal::SubtleMustCopy(indices_vec(i));
+          OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
+                      errors::InvalidArgument(
+                          strings::StrCat("Index ", index, " at offset ", i,
+                                          " in indices is out of range")));
+          const T& g = grad_flat(i);
+          auto learning_rate = lr_scalar;
+          auto prox_v = var_flat(index);  // by-value copy, not a view
+          prox_v -= learning_rate * g;
+          if (l1_scalar > 0) {
+            var_flat(index) = std::abs(prox_v) - learning_rate * l1_scalar;
+            var_flat(index) =
+                sgn(prox_v) * std::max(var_flat(index), static_cast<T>(0.0));
+          } else {
+            var_flat(index) = prox_v;
+          }
+          if (l2_scalar > 0) {
+            var_flat(index) /= (1.0 + l2_scalar * learning_rate);
+          }
+        }
+      }
+    }
+
+    ctx->forward_ref_input_to_ref_output(0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;  // value of the "use_locking" attr
+};
+
+#define REGISTER_KERNELS(T, Tindices)                                \
+  REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalGradientDescent") \
+                              .Device(DEVICE_CPU)                    \
+                              .TypeConstraint<T>("T")                \
+                              .TypeConstraint<Tindices>("Tindices"), \
+                          SparseApplyProximalGradientDescentOp<T, Tindices>);
 
-REGISTER_KERNELS(Eigen::half, int32);
-REGISTER_KERNELS(Eigen::half, int64);
 REGISTER_KERNELS(float, int32);
 REGISTER_KERNELS(float, int64);
 REGISTER_KERNELS(double, int32);
 REGISTER_KERNELS(double, int64);
-
 #undef REGISTER_KERNELS
 
 template <typename Device, typename T>
@@ -568,10 +838,11 @@ typedef Eigen::GpuDevice GPUDevice;
   REGISTER_KERNEL_BUILDER(                                            \
       Name("ApplyAdagrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
       ApplyAdagradOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
-REGISTER_KERNELS(CPU, Eigen::half);
-REGISTER_KERNELS(CPU, float);
-REGISTER_KERNELS(CPU, double);
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
@@ -593,15 +864,80 @@ REGISTER_KERNELS(GPU, Eigen::half);
 REGISTER_KERNELS(GPU, float);
 REGISTER_KERNELS(GPU, double);
 #endif
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
+template <typename Device, typename T>
+class ApplyProximalAdagradOp : public OpKernel {
+ public:
+  explicit ApplyProximalAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+  // Validates inputs, then invokes functor::ApplyProximalAdagrad on var/accum.
+  void Compute(OpKernelContext* ctx) override {
+    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+    Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(0)));
+    OP_REQUIRES(
+        ctx, accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(1)));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(accum.shape()),
+        errors::InvalidArgument("var and accum do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                accum.shape().DebugString()));
+    const Tensor& lr = ctx->input(2);
+    OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr.shape().DebugString()));
+    const Tensor& l1 = ctx->input(3);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(l1.shape()),
+        errors::InvalidArgument("l1 regularization strength is not a scalar: ",
+                                l1.shape().DebugString()));
+    const Tensor& l2 = ctx->input(4);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(l2.shape()),
+        errors::InvalidArgument("l2 regularization strength is not a scalar: ",
+                                l2.shape().DebugString()));
+
+    const Tensor& grad = ctx->input(5);
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyProximalAdagrad<Device, T>()(
+        device, var.flat<T>(), accum.flat<T>(), lr.scalar<T>(), l1.scalar<T>(),
+        l2.scalar<T>(), grad.flat<T>());
+    // Output 0 is forwarded from ref input 0 (var).
+    ctx->forward_ref_input_to_ref_output(0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;  // value of the "use_locking" attr
+};
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNELS(D, T)                                                \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("ApplyProximalAdagrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+      ApplyProximalAdagradOp<D##Device, T>);
+// Only CPU kernels for float and double are registered for this op.
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
 #undef REGISTER_KERNELS
 
 namespace {
-template <class T>
-inline T sgn(const T x) {
-  T zero(0);
-  T one(1);
-  return (x == zero ? zero : (x < zero ? -one : one));
-}
 
 template <typename T>
 inline T FtrlCompute(const T& accum, const T& linear, const T& lr, const T& l1,
@@ -730,9 +1066,167 @@ class SparseApplyAdagradOp : public OpKernel {
                               .TypeConstraint<T>("T")                \
                               .TypeConstraint<Tindices>("Tindices"), \
                           SparseApplyAdagradOp<T, Tindices>);
+#define REGISTER_CPU_KERNELS(T) \
+  REGISTER_KERNELS(T, int32);   \
+  REGISTER_KERNELS(T, int64);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
+// Sparse proximal Adagrad update on rows of var and accum. CPU only.
+template <typename T, typename Tindex>
+class SparseApplyProximalAdagradOp : public OpKernel {
+ public:
+  explicit SparseApplyProximalAdagradOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+  // Inputs 0 (var) and 1 (accum) are both mutable refs and are locked together.
+  void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
+    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+    Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(0)));
+    OP_REQUIRES(
+        ctx, accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(1)));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(accum.shape()),
+        errors::InvalidArgument("var and accum do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                accum.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
+                errors::InvalidArgument("var must be at least 1 dimensional"));
+
+    const Tensor& lr = ctx->input(2);
+    OP_REQUIRES(ctx, IsLegacyScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr.shape().DebugString()));
+    const Tensor& l1 = ctx->input(3);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(l1.shape()),
+        errors::InvalidArgument("l1 regularization strength is not a scalar: ",
+                                l1.shape().DebugString()));
+    const Tensor& l2 = ctx->input(4);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(l2.shape()),
+        errors::InvalidArgument("l2 regularization strength is not a scalar: ",
+                                l2.shape().DebugString()));
+
+    const Tensor& grad = ctx->input(5);
+    const Tensor& indices = ctx->input(6);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
+                errors::InvalidArgument("indices must be one-dimensional"));
+
+    int64 inner_dim = 1;  // product of grad's dimensions 1..rank-1
+    for (int d = 1; d < var.dims(); d++) {
+      OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
+                  errors::InvalidArgument(strings::StrCat(
+                      "var and grad must match in dimension ", d)));
+      inner_dim *= grad.dim_size(d);
+    }
+    const Tindex N = indices.dim_size(0);
+    OP_REQUIRES(
+        ctx, grad.dim_size(0) == N,
+        errors::InvalidArgument(
+            "grad must be the same size as indices in the first dimension."));
+
+    if (N > 0) {
+      if (inner_dim > 1) {
+        const Tindex first_dim_size = var.dim_size(0);
+        auto indices_vec = indices.vec<Tindex>();
+        auto var_flat = var.flat_outer_dims<T>();
+        auto accum_flat = accum.flat_outer_dims<T>();
+        auto grad_flat = grad.flat_outer_dims<T>();
+        T lr_scalar = lr.scalar<T>()();
+        T l1_scalar = l1.scalar<T>()();
+        T l2_scalar = l2.scalar<T>()();
+
+        for (Tindex i = 0; i < N; i++) {
+          const Tindex index = internal::SubtleMustCopy(indices_vec(i));
+          OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
+                      errors::InvalidArgument(
+                          strings::StrCat("Index ", index, " at offset ", i,
+                                          " in indices is out of range")));
+          auto a = accum_flat.template chip<0>(index);
+          auto g = grad_flat.template chip<0>(i);
+          auto v = var_flat.template chip<0>(index);
+          a += g.square();
+          auto learning_rate = a.constant(lr_scalar) * a.rsqrt();
+          // prox_v views the same row as v, so this step updates var in place.
+          auto prox_v = v;
+          prox_v -= g * learning_rate;
+          if (l1_scalar > 0) {
+            // Fused: the old two-step form overwrote v before sign() read it.
+            v = prox_v.sign() *
+                (prox_v.abs() - learning_rate * prox_v.constant(l1_scalar))
+                    .cwiseMax(static_cast<T>(0.0));
+          } else {
+            v = prox_v;
+          }
+          if (l2_scalar > 0) {
+            v /= (v.constant(1.0) + v.constant(l2_scalar) * learning_rate);
+          }
+        }
+      } else {
+        CHECK_EQ(1, inner_dim);
+        auto indices_vec = indices.vec<Tindex>();
+        auto var_flat = var.flat<T>();
+        auto accum_flat = accum.flat<T>();
+        auto grad_flat = grad.flat<T>();
+        T lr_scalar = lr.scalar<T>()();
+        T l1_scalar = l1.scalar<T>()();
+        T l2_scalar = l2.scalar<T>()();
+        const Tindex first_dim_size = accum_flat.size();
+
+        for (Tindex i = 0; i < N; i++) {
+          const Tindex index = internal::SubtleMustCopy(indices_vec(i));
+          OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
+                      errors::InvalidArgument(
+                          strings::StrCat("Index ", index, " at offset ", i,
+                                          " in indices is out of range")));
+          T& a = accum_flat(index);
+          const T& g = grad_flat(i);
+          a += g * g;
+          auto learning_rate = lr_scalar / std::sqrt(a);
+          auto prox_v = var_flat(index);  // by-value copy, not a view
+          prox_v -= learning_rate * g;
+          if (l1_scalar > 0) {
+            var_flat(index) = std::abs(prox_v) - learning_rate * l1_scalar;
+            var_flat(index) =
+                sgn(prox_v) * std::max(var_flat(index), static_cast<T>(0.0));
+          } else {
+            var_flat(index) = prox_v;
+          }
+          if (l2_scalar > 0) {
+            var_flat(index) /= (1.0 + l2_scalar * learning_rate);
+          }
+        }
+      }
+    }
+
+    ctx->forward_ref_input_to_ref_output(0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;  // value of the "use_locking" attr
+};
+
+#define REGISTER_KERNELS(T, Tindices)                                \
+  REGISTER_KERNEL_BUILDER(Name("SparseApplyProximalAdagrad")         \
+                              .Device(DEVICE_CPU)                    \
+                              .TypeConstraint<T>("T")                \
+                              .TypeConstraint<Tindices>("Tindices"), \
+                          SparseApplyProximalAdagradOp<T, Tindices>);
 
-REGISTER_KERNELS(Eigen::half, int32);
-REGISTER_KERNELS(Eigen::half, int64);
 REGISTER_KERNELS(float, int32);
 REGISTER_KERNELS(float, int64);
 REGISTER_KERNELS(double, int32);
@@ -822,10 +1316,13 @@ typedef Eigen::GpuDevice GPUDevice;
   REGISTER_KERNEL_BUILDER(                                         \
       Name("ApplyFtrl").Device(DEVICE_##D).TypeConstraint<T>("T"), \
       ApplyFtrlOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
-REGISTER_KERNELS(CPU, Eigen::half);
-REGISTER_KERNELS(CPU, float);
-REGISTER_KERNELS(CPU, double);
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
 // Note, this op works on cpu only.
@@ -1002,13 +1499,15 @@ class SparseApplyFtrlOp : public OpKernel {
                               .TypeConstraint<T>("T")                \
                               .TypeConstraint<Tindices>("Tindices"), \
                           SparseApplyFtrlOp<CPUDevice, T, Tindices>);
+#define REGISTER_CPU_KERNELS(T) \
+  REGISTER_KERNELS(T, int32);   \
+  REGISTER_KERNELS(T, int64);
 
-REGISTER_KERNELS(Eigen::half, int32);
-REGISTER_KERNELS(Eigen::half, int64);
-REGISTER_KERNELS(float, int32);
-REGISTER_KERNELS(float, int64);
-REGISTER_KERNELS(double, int32);
-REGISTER_KERNELS(double, int64);
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
 template <typename Device, typename T>
@@ -1070,10 +1569,11 @@ typedef Eigen::GpuDevice GPUDevice;
   REGISTER_KERNEL_BUILDER(                                             \
       Name("ApplyMomentum").Device(DEVICE_##D).TypeConstraint<T>("T"), \
       ApplyMomentumOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
-REGISTER_KERNELS(CPU, Eigen::half);
-REGISTER_KERNELS(CPU, float);
-REGISTER_KERNELS(CPU, double);
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
@@ -1096,6 +1596,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
 REGISTER_KERNELS(GPU, float);
 REGISTER_KERNELS(GPU, double);
 #endif
+#undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
 // Note, this op works on cpu only.
@@ -1129,7 +1630,7 @@ class SparseApplyMomentumOp : public OpKernel {
 
     const Tensor& lr = ctx->input(2);
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
-                errors::InvalidArgument("lr is not a scalar: ",
+                errors::InvalidArgument("lr is not a scalar : ",
                                         lr.shape().DebugString()));
     const Tensor& grad = ctx->input(3);
     const Tensor& indices = ctx->input(4);
@@ -1188,13 +1689,15 @@ class SparseApplyMomentumOp : public OpKernel {
                               .TypeConstraint<T>("T")                \
                               .TypeConstraint<Tindices>("Tindices"), \
                           SparseApplyMomentumOp<T, Tindices>);
+#define REGISTER_CPU_KERNELS(T) \
+  REGISTER_KERNELS(T, int32);   \
+  REGISTER_KERNELS(T, int64);
 
-REGISTER_KERNELS(Eigen::half, int32);
-REGISTER_KERNELS(Eigen::half, int64);
-REGISTER_KERNELS(float, int32);
-REGISTER_KERNELS(float, int64);
-REGISTER_KERNELS(double, int32);
-REGISTER_KERNELS(double, int64);
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
 template <typename Device, typename T>
@@ -1237,7 +1740,7 @@ class ApplyAdamOp : public OpKernel {
                 errors::InvalidArgument("beta2_power is not a scalar: ",
                                         beta2_power.shape().DebugString()));
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
-                errors::InvalidArgument("lr is not a scalar: ",
+                errors::InvalidArgument("lr is not a scalar : ",
                                         lr.shape().DebugString()));
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()),
                 errors::InvalidArgument("beta1 is not a scalar: ",
@@ -1285,10 +1788,11 @@ typedef Eigen::GpuDevice GPUDevice;
   REGISTER_KERNEL_BUILDER(                                         \
       Name("ApplyAdam").Device(DEVICE_##D).TypeConstraint<T>("T"), \
       ApplyAdamOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
-REGISTER_KERNELS(CPU, Eigen::half);
-REGISTER_KERNELS(CPU, float);
-REGISTER_KERNELS(CPU, double);
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
@@ -1316,6 +1820,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
 REGISTER_KERNELS(GPU, float);
 REGISTER_KERNELS(GPU, double);
 #endif
+#undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
 template <typename Device, typename T>
@@ -1352,7 +1857,7 @@ class ApplyRMSPropOp : public OpKernel {
     const Tensor& grad = ctx->input(7);
 
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
-                errors::InvalidArgument("lr is not a scalar: ",
+                errors::InvalidArgument("lr is not a scalar : ",
                                         lr.shape().DebugString()));
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(rho.shape()),
                 errors::InvalidArgument("rho is not a scalar: ",
@@ -1400,10 +1905,11 @@ typedef Eigen::GpuDevice GPUDevice;
   REGISTER_KERNEL_BUILDER(                                            \
       Name("ApplyRMSProp").Device(DEVICE_##D).TypeConstraint<T>("T"), \
       ApplyRMSPropOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
 
-REGISTER_KERNELS(CPU, Eigen::half);
-REGISTER_KERNELS(CPU, float);
-REGISTER_KERNELS(CPU, double);
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
 
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
@@ -1428,6 +1934,7 @@ REGISTER_KERNELS(GPU, Eigen::half);
 REGISTER_KERNELS(GPU, float);
 REGISTER_KERNELS(GPU, double);
 #endif
+#undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index 7b4291cccc6..b9946cd9228 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -44,6 +44,24 @@ struct ApplyAdadelta {
                   typename TTypes<T>::ConstFlat grad);
 };
 
+template <typename Device, typename T>
+struct FobosElasticNet {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar l1,
+                  typename TTypes<T>::ConstScalar l2,
+                  typename TTypes<T>::ConstFlat grad);
+};
+
+template <typename Device, typename T>
+struct ApplyProximalGradientDescent {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar l1,
+                  typename TTypes<T>::ConstScalar l2,
+                  typename TTypes<T>::ConstFlat grad);
+};
+
 template <typename Device, typename T>
 struct ApplyAdagrad {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
@@ -52,6 +70,16 @@ struct ApplyAdagrad {
                   typename TTypes<T>::ConstFlat grad);
 };
 
+template <typename Device, typename T>
+struct ApplyProximalAdagrad {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar l1,
+                  typename TTypes<T>::ConstScalar l2,
+                  typename TTypes<T>::ConstFlat grad);
+};
+
 template <typename Device, typename T>
 struct ApplyFtrl {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index de83c25f06e..639bad5f04f 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/xent_op.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 
@@ -86,18 +87,14 @@ struct XentFunctor<CPUDevice, T> {
 };
 }  // namespace functor
 
-REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<Eigen::half>("T"),
-                        SoftmaxXentWithLogitsOp<CPUDevice, Eigen::half>);
-REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        SoftmaxXentWithLogitsOp<CPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<double>("T"),
-                        SoftmaxXentWithLogitsOp<CPUDevice, double>);
+#define REGISTER_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits") \
+                              .Device(DEVICE_CPU)               \
+                              .TypeConstraint<T>("T"),          \
+                          SoftmaxXentWithLogitsOp<CPUDevice, T>);
+TF_CALL_half(REGISTER_CPU);
+TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
diff --git a/tensorflow/core/ops/compat/ops_history.v0.pbtxt b/tensorflow/core/ops/compat/ops_history.v0.pbtxt
index ed60c227a5f..c20e8c36001 100644
--- a/tensorflow/core/ops/compat/ops_history.v0.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v0.pbtxt
@@ -1825,6 +1825,127 @@ op {
     }
   }
 }
+op {
+  name: "ApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ApplyRMSProp"
   input_arg {
@@ -3011,6 +3132,63 @@ op {
     }
   }
 }
+op {
+  name: "AvgPool"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "AvgPool3D"
   input_arg {
@@ -3232,6 +3410,67 @@ op {
     }
   }
 }
+op {
+  name: "AvgPoolGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "BatchCholesky"
   input_arg {
@@ -11801,6 +12040,124 @@ op {
     }
   }
 }
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
 op {
   name: "MaxPool3D"
   input_arg {
@@ -12014,6 +12371,73 @@ op {
     }
   }
 }
+op {
+  name: "MaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
 op {
   name: "MaxPoolGradWithArgmax"
   input_arg {
@@ -12065,6 +12489,70 @@ op {
     }
   }
 }
+op {
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
 op {
   name: "MaxPoolWithArgmax"
   input_arg {
@@ -12115,6 +12603,69 @@ op {
     }
   }
 }
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
 op {
   name: "Maximum"
   input_arg {
@@ -20499,6 +21050,155 @@ op {
     }
   }
 }
+op {
+  name: "SparseApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "SparseConcat"
   input_arg {
diff --git a/tensorflow/core/ops/nn_grad.cc b/tensorflow/core/ops/nn_grad.cc
index c1a42e74beb..e3b876b2401 100644
--- a/tensorflow/core/ops/nn_grad.cc
+++ b/tensorflow/core/ops/nn_grad.cc
@@ -154,22 +154,25 @@ Status MaxPoolGrad(const AttrSlice& attrs, FunctionDef* g) {
   // clang-format off
   *g = FDH::Define(
     // Arg defs
-    {"input: float", "grad: float"},
+    {"input: T", "grad: T"},
     // Ret val defs
-    {"output: float"},
+    {"output: T"},
     // Attr defs
-    {"ksize: list(int) >= 4",
+    {"T: {float, half} = DT_FLOAT",
+     "ksize: list(int) >= 4",
      "strides: list(int) >= 4",
      GetPaddingAttrString()},
     // Nodes
     {
       // Invoke MaxPool again to recompute the outputs (removed by CSE?).
       {{"maxpool"}, "MaxPool", {"input"},
-       /*Attrs=*/{{"ksize", "$ksize"},
+       /*Attrs=*/{{"T", "$T"},
+                  {"ksize", "$ksize"},
                   {"strides", "$strides"},
                   {"padding", "$padding"}}},
       {{"output"}, "MaxPoolGrad", {"input", "maxpool", "grad"},
-       /*Attrs=*/{{"ksize", "$ksize"},
+       /*Attrs=*/{{"T", "$T"},
+                  {"ksize", "$ksize"},
                   {"strides", "$strides"},
                   {"padding", "$padding"}}}
     });
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index fee145be538..b53945a4a0b 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -28,7 +28,7 @@ REGISTER_OP("AvgPool")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Attr("T: {float, double}")
+    .Attr("T: {float, half, double}")
     .Doc(R"doc(
 Performs average pooling on the input.
 
@@ -55,7 +55,7 @@ REGISTER_OP("AvgPoolGrad")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Attr("T: {float, double}")
+    .Attr("T: {float, half, double}")
     .Doc(R"doc(
 Computes gradients of the average pooling function.
 
@@ -642,12 +642,13 @@ output: The gradients for LRN.
 // --------------------------------------------------------------------------
 
 REGISTER_OP("MaxPool")
+    .Attr("T: {float, half} = DT_FLOAT")
     .Attr("ksize: list(int) >= 4")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Input("input: float")
-    .Output("output: float")
+    .Input("input: T")
+    .Output("output: T")
     .Doc(R"doc(
 Performs max pooling on the input.
 
@@ -669,10 +670,11 @@ REGISTER_OP("MaxPoolGrad")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Input("orig_input: float")
-    .Input("orig_output: float")
-    .Input("grad: float")
-    .Output("output: float")
+    .Input("orig_input: T")
+    .Input("orig_output: T")
+    .Input("grad: T")
+    .Output("output: T")
+    .Attr("T: {float, half} = DT_FLOAT")
     .Doc(R"doc(
 Computes gradients of the maxpooling function.
 
@@ -696,9 +698,10 @@ REGISTER_OP("MaxPoolWithArgmax")
     .Attr("strides: list(int) >= 4")
     .Attr("Targmax: {int32, int64} = DT_INT64")
     .Attr(GetPaddingAttrString())
-    .Input("input: float")
-    .Output("output: float")
+    .Input("input: T")
+    .Output("output: T")
     .Output("argmax: Targmax")
+    .Attr("T: {float, half} = DT_FLOAT")
     .Doc(R"doc(
 Performs max pooling on the input and outputs both max values and indices.
 
@@ -720,10 +723,11 @@ REGISTER_OP("MaxPoolGradWithArgmax")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr("Targmax: {int32, int64}")
-    .Input("input: float")
-    .Input("grad: float")
+    .Input("input: T")
+    .Input("grad: T")
     .Input("argmax: Targmax")
-    .Output("output: float")
+    .Output("output: T")
+    .Attr("T: {float, half} = DT_FLOAT")
     .Doc(R"doc(
 Computes gradients of the maxpooling function.
 
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 5fb34e79d1a..a8d445c3c4c 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -591,12 +591,12 @@ op {
   }
   input_arg {
     name: "l1"
-    description: "Scaling factor. Must be a scalar."
+    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "Scaling factor. Must be a scalar."
+    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -767,6 +767,146 @@ op {
   summary: "Update \'*var\' according to the momentum scheme."
   description: "accum = accum * momentum + grad\nvar -= lr * accum"
 }
+op {
+  name: "ApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    description: "L1 regularization. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    description: "L2 regularization. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    description: "Same as \"var\"."
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
+  }
+  summary: "Update \'*var\' and \'*accum\' according to FOBOS with Adagrad learning rate."
+  description: "accum += grad * grad\nprox_v = var - lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}"
+}
+op {
+  name: "ApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    description: "L1 regularization. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    description: "L2 regularization. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    description: "The change."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    description: "Same as \"var\"."
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
+  }
+  summary: "Update \'*var\' as FOBOS algorithm with fixed learning rate."
+  description: "prox_v = var - alpha * delta\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}"
+}
 op {
   name: "ApplyRMSProp"
   input_arg {
@@ -1251,6 +1391,7 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_DOUBLE
       }
     }
@@ -1447,6 +1588,7 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_DOUBLE
       }
     }
@@ -6614,12 +6756,25 @@ op {
   input_arg {
     name: "input"
     description: "4-D input to pool over."
-    type: DT_FLOAT
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     description: "The max pooled output tensor."
-    type: DT_FLOAT
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
   }
   attr {
     name: "ksize"
@@ -6798,22 +6953,22 @@ op {
   input_arg {
     name: "orig_input"
     description: "The original input tensor."
-    type: DT_FLOAT
+    type_attr: "T"
   }
   input_arg {
     name: "orig_output"
     description: "The original output tensor."
-    type: DT_FLOAT
+    type_attr: "T"
   }
   input_arg {
     name: "grad"
     description: "4-D.  Gradients w.r.t. the output of `max_pool`."
-    type: DT_FLOAT
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     description: "Gradients w.r.t. the input to `max_pool`."
-    type: DT_FLOAT
+    type_attr: "T"
   }
   attr {
     name: "ksize"
@@ -6854,6 +7009,19 @@ op {
       }
     }
   }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
   summary: "Computes gradients of the maxpooling function."
 }
 op {
@@ -6861,12 +7029,12 @@ op {
   input_arg {
     name: "input"
     description: "The original input."
-    type: DT_FLOAT
+    type_attr: "T"
   }
   input_arg {
     name: "grad"
     description: "4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the\noutput of `max_pool`."
-    type: DT_FLOAT
+    type_attr: "T"
   }
   input_arg {
     name: "argmax"
@@ -6876,7 +7044,7 @@ op {
   output_arg {
     name: "output"
     description: "Gradients w.r.t. the input of `max_pool`."
-    type: DT_FLOAT
+    type_attr: "T"
   }
   attr {
     name: "ksize"
@@ -6913,6 +7081,19 @@ op {
       }
     }
   }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
   summary: "Computes gradients of the maxpooling function."
 }
 op {
@@ -6920,12 +7101,12 @@ op {
   input_arg {
     name: "input"
     description: "4-D with shape `[batch, height, width, channels]`.  Input to pool over."
-    type: DT_FLOAT
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     description: "The max pooled output tensor."
-    type: DT_FLOAT
+    type_attr: "T"
   }
   output_arg {
     name: "argmax"
@@ -6970,6 +7151,19 @@ op {
       }
     }
   }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
   summary: "Performs max pooling on the input and outputs both max values and indices."
   description: "The indices in `argmax` are flattened, so that a maximum value at position\n`[b, y, x, c]` becomes flattened index\n`((b * height + y) * width + x) * channels + c`."
 }
@@ -11380,12 +11574,12 @@ op {
   }
   input_arg {
     name: "l1"
-    description: "Scaling factor. Must be a scalar."
+    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "Scaling factor. Must be a scalar."
+    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -11525,6 +11719,176 @@ op {
   summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme."
   description: "That is for rows we have grad for, we update var and accum as follows:\n\naccum = accum * momentum + grad\nvar -= lr * accum"
 }
+op {
+  name: "SparseApplyProximalAdagrad"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    description: "Learning rate. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    description: "L1 regularization. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    description: "L2 regularization. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    description: "A vector of indices into the first dimension of var and accum."
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    description: "Same as \"var\"."
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
+  }
+  summary: "Sparse update entries in \'*var\' and \'*accum\' according to FOBOS algorithm."
+  description: "That is for rows we have grad for, we update var and accum as follows:\naccum += grad * grad\nprox_v = var\nprox_v -= lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}"
+}
+op {
+  name: "SparseApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    description: "L1 regularization. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    description: "L2 regularization. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    description: "A vector of indices into the first dimension of var."
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "out"
+    description: "Same as \"var\"."
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
+  }
+  summary: "Sparse update \'*var\' as FOBOS algorithm with fixed learning rate."
+  description: "That is for rows we have grad for, we update var as follows:\nprox_v = var - alpha * grad\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}"
+}
 op {
   name: "SparseConcat"
   input_arg {
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index e6a805a8777..5eb011684b4 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -35,6 +35,59 @@ use_locking: If `True`, the subtraction will be protected by a lock;
   otherwise the behavior is undefined, but may exhibit less contention.
 )doc");
 
+REGISTER_OP("ApplyProximalGradientDescent")
+    .Input("var: Ref(T)")
+    .Input("alpha: T")
+    .Input("l1: T")
+    .Input("l2: T")
+    .Input("delta: T")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .Doc(R"doc(
+Update '*var' as FOBOS algorithm with fixed learning rate.
+prox_v = var - alpha * delta
+var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+
+var: Should be from a Variable().
+alpha: Scaling factor. Must be a scalar.
+l1: L1 regularization. Must be a scalar.
+l2: L2 regularization. Must be a scalar.
+delta: The change.
+out: Same as "var".
+use_locking: If True, the subtraction will be protected by a lock;
+  otherwise the behavior is undefined, but may exhibit less contention.
+)doc");
+
+REGISTER_OP("SparseApplyProximalGradientDescent")
+    .Input("var: Ref(T)")
+    .Input("alpha: T")
+    .Input("l1: T")
+    .Input("l2: T")
+    .Input("grad: T")
+    .Input("indices: Tindices")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = false")
+    .Doc(R"doc(
+Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+
+That is for rows we have grad for, we update var as follows:
+prox_v = var - alpha * grad
+var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+
+var: Should be from a Variable().
+alpha: Scaling factor. Must be a scalar.
+l1: L1 regularization. Must be a scalar.
+l2: L2 regularization. Must be a scalar.
+grad: The gradient.
+indices: A vector of indices into the first dimension of var.
+out: Same as "var".
+use_locking: If True, the subtraction will be protected by a lock;
+  otherwise the behavior is undefined, but may exhibit less contention.
+)doc");
+
 REGISTER_OP("ApplyAdadelta")
     .Input("var: Ref(T)")
     .Input("accum: Ref(T)")
@@ -117,6 +170,33 @@ use_locking: If `True`, updating of the var and accum tensors will be protected
   contention.
 )doc");
 
+REGISTER_OP("ApplyProximalAdagrad")
+    .Input("var: Ref(T)")
+    .Input("accum: Ref(T)")
+    .Input("lr: T")
+    .Input("l1: T")
+    .Input("l2: T")
+    .Input("grad: T")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .Doc(R"doc(
+Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
+accum += grad * grad
+prox_v = var - lr * grad * (1 / sqrt(accum))
+var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+
+var: Should be from a Variable().
+accum: Should be from a Variable().
+grad: The gradient.
+lr: Scaling factor. Must be a scalar.
+l1: L1 regularization. Must be a scalar.
+l2: L2 regularization. Must be a scalar.
+out: Same as "var".
+use_locking: If True, updating of the var and accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+)doc");
+
 REGISTER_OP("SparseApplyAdagrad")
     .Input("var: Ref(T)")
     .Input("accum: Ref(T)")
@@ -145,6 +225,39 @@ use_locking: If `True`, updating of the var and accum tensors will be protected
   contention.
 )doc");
 
+REGISTER_OP("SparseApplyProximalAdagrad")
+    .Input("var: Ref(T)")
+    .Input("accum: Ref(T)")
+    .Input("lr: T")
+    .Input("l1: T")
+    .Input("l2: T")
+    .Input("grad: T")
+    .Input("indices: Tindices")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = false")
+    .Doc(R"doc(
+Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+
+That is for rows we have grad for, we update var and accum as follows:
+accum += grad * grad
+prox_v = var
+prox_v -= lr * grad * (1 / sqrt(accum))
+var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+
+var: Should be from a Variable().
+accum: Should be from a Variable().
+lr: Learning rate. Must be a scalar.
+l1: L1 regularization. Must be a scalar.
+l2: L2 regularization. Must be a scalar.
+grad: The gradient.
+indices: A vector of indices into the first dimension of var and accum.
+out: Same as "var".
+use_locking: If True, updating of the var and accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+)doc");
+
 REGISTER_OP("ApplyFtrl")
     .Input("var: Ref(T)")
     .Input("accum: Ref(T)")
@@ -171,8 +284,8 @@ accum: Should be from a Variable().
 linear: Should be from a Variable().
 grad: The gradient.
 lr: Scaling factor. Must be a scalar.
-l1: Scaling factor. Must be a scalar.
-l2: Scaling factor. Must be a scalar.
+l1: L1 regularization. Must be a scalar.
+l2: L2 regularization. Must be a scalar.
 lr_power: Scaling factor. Must be a scalar.
 out: Same as "var".
 use_locking: If `True`, updating of the var and accum tensors will be protected
@@ -210,8 +323,8 @@ linear: Should be from a Variable().
 grad: The gradient.
 indices: A vector of indices into the first dimension of var and accum.
 lr: Scaling factor. Must be a scalar.
-l1: Scaling factor. Must be a scalar.
-l2: Scaling factor. Must be a scalar.
+l1: L1 regularization. Must be a scalar.
+l2: L2 regularization. Must be a scalar.
 lr_power: Scaling factor. Must be a scalar.
 out: Same as "var".
 use_locking: If `True`, updating of the var and accum tensors will be protected
diff --git a/tensorflow/core/util/stat_summarizer.h b/tensorflow/core/util/stat_summarizer.h
index 9a82bdc3e18..c5dea66c65c 100644
--- a/tensorflow/core/util/stat_summarizer.h
+++ b/tensorflow/core/util/stat_summarizer.h
@@ -69,9 +69,9 @@ class Stat {
                    : static_cast<HighPrecisionValueType>(sum_) / count_;
   }
 
-  ValueType rms() const { return sqrt(squared_sum_ / count_); }
-
-  ValueType std_deviation() const { return all_same() ? 0 : rms() - avg(); }
+  ValueType std_deviation() const {
+    return all_same() ? 0 : sqrt(squared_sum_ / count_ - avg() * avg());
+  }
 
   void OutputToStream(std::ostream* stream) const {
     if (empty()) {
diff --git a/tensorflow/examples/skflow/iris_val_based_early_stopping.py b/tensorflow/examples/skflow/iris_val_based_early_stopping.py
index e6e0b7d76d9..72e0595544f 100644
--- a/tensorflow/examples/skflow/iris_val_based_early_stopping.py
+++ b/tensorflow/examples/skflow/iris_val_based_early_stopping.py
@@ -18,35 +18,38 @@ from __future__ import print_function
 from sklearn import datasets
 from sklearn import metrics
 from sklearn.cross_validation import train_test_split
+import tensorflow as tf
 
 from tensorflow.contrib import learn
 
 
-iris = datasets.load_iris()
-X_train, X_test, y_train, y_test = train_test_split(iris.data,
-                                                    iris.target,
-                                                    test_size=0.2,
-                                                    random_state=42)
+def main(unused_argv):
+  iris = datasets.load_iris()
+  x_train, x_test, y_train, y_test = train_test_split(
+      iris.data, iris.target, test_size=0.2, random_state=42)
 
-X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
-                                                  test_size=0.2,
-                                                  random_state=42)
-val_monitor = learn.monitors.ValidationMonitor(X_val, y_val,
-                                               early_stopping_rounds=200)
+  x_train, x_val, y_train, y_val = train_test_split(
+      x_train, y_train, test_size=0.2, random_state=42)
+  val_monitor = learn.monitors.ValidationMonitor(
+      x_val, y_val, early_stopping_rounds=200)
 
-# classifier with early stopping on training data
-classifier1 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
-                                            n_classes=3,
-                                            model_dir='/tmp/iris_model/')
-classifier1.fit(X_train, y_train, steps=2000)
-score1 = metrics.accuracy_score(y_test, classifier1.predict(X_test))
+  # classifier with early stopping on training data
+  classifier1 = learn.TensorFlowDNNClassifier(
+      hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model/')
+  classifier1.fit(x=x_train, y=y_train, steps=2000)
+  score1 = metrics.accuracy_score(y_test, classifier1.predict(x_test))
 
-# classifier with early stopping on validation data
-classifier2 = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
-                                            n_classes=3,
-                                            model_dir='/tmp/iris_model_val/')
-classifier2.fit(X_train, y_train, val_monitor, steps=2000)
-score2 = metrics.accuracy_score(y_test, classifier2.predict(X_test))
+  # classifier with early stopping on validation data, save frequently for
+  # monitor to pick up new checkpoints.
+  classifier2 = learn.TensorFlowDNNClassifier(
+      hidden_units=[10, 20, 10], n_classes=3, model_dir='/tmp/iris_model_val/',
+      config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
+  classifier2.fit(x=x_train, y=y_train, steps=2000, monitors=[val_monitor])
+  score2 = metrics.accuracy_score(y_test, classifier2.predict(x_test))
 
-# In many applications, the score is improved by using early stopping
-print(score2 > score1)
+  # In many applications, the score is improved by using early stopping
+  print(score2 > score1)
+
+
+if __name__ == '__main__':
+  tf.app.run()
diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
index 91bc69c6b21..5c1b2f1eed9 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
@@ -164,7 +164,7 @@ def train():
                               feed_dict=feed_dict(True),
                               options=run_options,
                               run_metadata=run_metadata)
-        train_writer.add_run_metadata(run_metadata, 'step%d' % i)
+        train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
         train_writer.add_summary(summary, i)
         print('Adding run metadata for', i)
       else:  # Record a summary
diff --git a/tensorflow/g3doc/api_docs/python/array_ops.md b/tensorflow/g3doc/api_docs/python/array_ops.md
index b80026bd3a2..74bdd822e42 100644
--- a/tensorflow/g3doc/api_docs/python/array_ops.md
+++ b/tensorflow/g3doc/api_docs/python/array_ops.md
@@ -216,7 +216,7 @@ This operation returns a 1-D integer tensor representing the shape of `input`.
 
 For example:
 
-```prettyprint
+```python
 # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
 shape(t) ==> [2, 2, 3]
 ```
@@ -224,7 +224,7 @@ shape(t) ==> [2, 2, 3]
 ##### Args:
 
 
-*  <b>`input`</b>: A `Tensor`.
+*  <b>`input`</b>: A `Tensor` or `SparseTensor`.
 *  <b>`name`</b>: A name for the operation (optional).
 
 ##### Returns:
diff --git a/tensorflow/g3doc/api_docs/python/contrib.learn.md b/tensorflow/g3doc/api_docs/python/contrib.learn.md
index 7e8228938a0..b764ca43534 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.learn.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.learn.md
@@ -796,11 +796,11 @@ A classifier for TensorFlow DNN models.
 
     def input_fn_eval: # returns x, Y
       pass
-    estimator.evaluate(input_fn_eval)
-    estimator.predict(x)
+    estimator.evaluate(input_fn=input_fn_eval)
+    estimator.predict(x=x)
     ```
 
-  Input of `fit`, `train`, and `evaluate` should have following features,
+  Input of `fit` and `evaluate` should have following features,
     otherwise there will be a `KeyError`:
       if `weight_column_name` is not `None`, a feature with
         `key=weight_column_name` whose value is a `Tensor`.
@@ -1140,11 +1140,11 @@ A regressor for TensorFlow DNN models.
 
     def input_fn_eval: # returns x, Y
       pass
-    estimator.evaluate(input_fn_eval)
-    estimator.predict(x)
+    estimator.evaluate(input_fn=input_fn_eval)
+    estimator.predict(x=x)
     ```
 
-  Input of `fit`, `train`, and `evaluate` should have following features,
+  Input of `fit` and `evaluate` should have following features,
     otherwise there will be a `KeyError`:
       if `weight_column_name` is not `None`, a feature with
         `key=weight_column_name` whose value is a `Tensor`.
@@ -2245,10 +2245,10 @@ Linear classifier model.
     ...
   estimator.fit(input_fn=input_fn_train)
   estimator.evaluate(input_fn=input_fn_eval)
-  estimator.predict(x)
+  estimator.predict(x=x)
   ```
 
-  Input of `fit`, `train`, and `evaluate` should have following features,
+  Input of `fit` and `evaluate` should have following features,
     otherwise there will be a `KeyError`:
       if `weight_column_name` is not `None`, a feature with
         `key=weight_column_name` whose value is a `Tensor`.
@@ -2579,10 +2579,10 @@ Linear regressor model.
     ...
   estimator.fit(input_fn=input_fn_train)
   estimator.evaluate(input_fn=input_fn_eval)
-  estimator.predict(x)
+  estimator.predict(x=x)
   ```
 
-  Input of `fit`, `train`, and `evaluate` should have following features,
+  Input of `fit` and `evaluate` should have following features,
     otherwise there will be a KeyError:
       if `weight_column_name` is not None:
         key=weight_column_name, value=a `Tensor`
@@ -4238,50 +4238,33 @@ Perform various training, evaluation, and inference actions on a graph.
 ### `class tf.contrib.learn.RunConfig` {#RunConfig}
 
 This class specifies the specific configurations for the run.
-
-Parameters:
-  execution_mode: Runners use this flag to execute different tasks, like
-    training vs evaluation. 'all' (the default) executes both training and
-    eval.
-  master: TensorFlow master. Empty string (the default) for local.
-  task: Task id of the replica running the training (default: 0).
-  num_ps_replicas: Number of parameter server tasks to use (default: 0).
-  training_worker_session_startup_stagger_secs: Seconds to sleep between the
-    startup of each worker task session (default: 5).
-  training_worker_max_startup_secs: Max seconds to wait before starting any
-    worker (default: 60).
-  eval_delay_secs: Number of seconds between the beginning of each eval run.
-    If one run takes more than this amount of time, the next run will start
-    immediately once that run completes (default 60).
-  eval_steps: Number of steps to run in each eval (default: 100).
-  num_cores: Number of cores to be used (default: 4).
-  verbose: Controls the verbosity, possible values:
-    0: the algorithm and debug information is muted.
-    1: trainer prints the progress.
-    2: log device placement is printed.
-  gpu_memory_fraction: Fraction of GPU memory used by the process on
-    each GPU uniformly on the same machine.
-  tf_random_seed: Random seed for TensorFlow initializers.
-    Setting this value allows consistency between reruns.
-  keep_checkpoint_max: The maximum number of recent checkpoint files to keep.
-    As new files are created, older files are deleted.
-    If None or 0, all checkpoint files are kept.
-    Defaults to 5 (that is, the 5 most recent checkpoint files are kept.)
-  keep_checkpoint_every_n_hours: Number of hours between each checkpoint
-    to be saved. The default value of 10,000 hours effectively disables
-    the feature.
-
-Attributes:
-  tf_master: Tensorflow master.
-  tf_config: Tensorflow Session Config proto.
-  tf_random_seed: Tensorflow random seed.
-  keep_checkpoint_max: Maximum number of checkpoints to keep.
-  keep_checkpoint_every_n_hours: Number of hours between each checkpoint.
 - - -
 
-#### `tf.contrib.learn.RunConfig.__init__(execution_mode='all', master='', task=0, num_ps_replicas=0, training_worker_session_startup_stagger_secs=5, training_worker_max_startup_secs=60, eval_delay_secs=60, eval_steps=100, num_cores=4, verbose=1, gpu_memory_fraction=1, tf_random_seed=42, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}
+#### `tf.contrib.learn.RunConfig.__init__(master='', task=0, num_ps_replicas=0, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, tf_random_seed=42, save_summary_steps=100, save_checkpoints_secs=60, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}
+
+Constructor.
+
+##### Args:
 
 
+*  <b>`master`</b>: TensorFlow master. Empty string (the default) for local.
+*  <b>`task`</b>: Task id of the replica running the training (default: 0).
+*  <b>`num_ps_replicas`</b>: Number of parameter server tasks to use (default: 0).
+*  <b>`num_cores`</b>: Number of cores to be used (default: 4).
+*  <b>`log_device_placement`</b>: Log the op placement to devices (default: False).
+*  <b>`gpu_memory_fraction`</b>: Fraction of GPU memory used by the process on
+    each GPU uniformly on the same machine.
+*  <b>`tf_random_seed`</b>: Random seed for TensorFlow initializers.
+    Setting this value allows consistency between reruns.
+*  <b>`save_summary_steps`</b>: Save summaries every this many steps.
+*  <b>`save_checkpoints_secs`</b>: Save checkpoints every this many seconds.
+*  <b>`keep_checkpoint_max`</b>: The maximum number of recent checkpoint files to
+    keep. As new files are created, older files are deleted. If None or 0,
+    all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
+    checkpoint files are kept.)
+*  <b>`keep_checkpoint_every_n_hours`</b>: Number of hours between each checkpoint
+    to be saved. The default value of 10,000 hours effectively disables
+    the feature.
 
 
 
@@ -4394,7 +4377,7 @@ Run `output_dict` tensors `n` times, with the same `feed_dict` each run.
 
 - - -
 
-### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, max_steps=None, fail_on_nan_loss=True, monitors=None)` {#train}
+### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, steps=None, fail_on_nan_loss=True, monitors=None)` {#train}
 
 Train a model.
 
@@ -4437,7 +4420,7 @@ program is terminated with exit code 1.
     `supervisor_save_summaries_steps` seconds when training.
 *  <b>`feed_fn`</b>: A function that is called every iteration to produce a `feed_dict`
     passed to `session.run` calls. Optional.
-*  <b>`max_steps`</b>: Train until `global_step_tensor` evaluates to this value.
+*  <b>`steps`</b>: Trains for this many steps (i.e. current global step + `steps`).
 *  <b>`fail_on_nan_loss`</b>: If true, raise `NanLossDuringTrainingError` if `loss_op`
     evaluates to `NaN`. If false, continue training as if nothing happened.
 *  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md
index 51796a694cd..9079336f9ce 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md
@@ -18,10 +18,10 @@ Linear regressor model.
     ...
   estimator.fit(input_fn=input_fn_train)
   estimator.evaluate(input_fn=input_fn_eval)
-  estimator.predict(x)
+  estimator.predict(x=x)
   ```
 
-  Input of `fit`, `train`, and `evaluate` should have following features,
+  Input of `fit` and `evaluate` should have following features,
     otherwise there will be a KeyError:
       if `weight_column_name` is not None:
         key=weight_column_name, value=a `Tensor`
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md
index 5bc561962b3..9d5dc8bd7bc 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md
@@ -18,10 +18,10 @@ Linear classifier model.
     ...
   estimator.fit(input_fn=input_fn_train)
   estimator.evaluate(input_fn=input_fn_eval)
-  estimator.predict(x)
+  estimator.predict(x=x)
   ```
 
-  Input of `fit`, `train`, and `evaluate` should have following features,
+  Input of `fit` and `evaluate` should have following features,
     otherwise there will be a `KeyError`:
       if `weight_column_name` is not `None`, a feature with
         `key=weight_column_name` whose value is a `Tensor`.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.train.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.train.md
index 65057636ce7..33ec7f0d532 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.train.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.train.md
@@ -1,4 +1,4 @@
-### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, max_steps=None, fail_on_nan_loss=True, monitors=None)` {#train}
+### `tf.contrib.learn.train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, steps=None, fail_on_nan_loss=True, monitors=None)` {#train}
 
 Train a model.
 
@@ -41,7 +41,7 @@ program is terminated with exit code 1.
     `supervisor_save_summaries_steps` seconds when training.
 *  <b>`feed_fn`</b>: A function that is called every iteration to produce a `feed_dict`
     passed to `session.run` calls. Optional.
-*  <b>`max_steps`</b>: Train until `global_step_tensor` evaluates to this value.
+*  <b>`steps`</b>: Trains for this many steps (i.e. current global step + `steps`).
 *  <b>`fail_on_nan_loss`</b>: If true, raise `NanLossDuringTrainingError` if `loss_op`
     evaluates to `NaN`. If false, continue training as if nothing happened.
 *  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.shape.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.shape.md
index 4262f41a3d3..4cbbcf4ab15 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.shape.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.shape.md
@@ -6,7 +6,7 @@ This operation returns a 1-D integer tensor representing the shape of `input`.
 
 For example:
 
-```prettyprint
+```python
 # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
 shape(t) ==> [2, 2, 3]
 ```
@@ -14,7 +14,7 @@ shape(t) ==> [2, 2, 3]
 ##### Args:
 
 
-*  <b>`input`</b>: A `Tensor`.
+*  <b>`input`</b>: A `Tensor` or `SparseTensor`.
 *  <b>`name`</b>: A name for the operation (optional).
 
 ##### Returns:
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md
index 645304ee74c..c68a339de35 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md
@@ -21,11 +21,11 @@ A classifier for TensorFlow DNN models.
 
     def input_fn_eval: # returns x, Y
       pass
-    estimator.evaluate(input_fn_eval)
-    estimator.predict(x)
+    estimator.evaluate(input_fn=input_fn_eval)
+    estimator.predict(x=x)
     ```
 
-  Input of `fit`, `train`, and `evaluate` should have following features,
+  Input of `fit` and `evaluate` should have following features,
     otherwise there will be a `KeyError`:
       if `weight_column_name` is not `None`, a feature with
         `key=weight_column_name` whose value is a `Tensor`.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md
index ffdf8703c09..35a71be5f8c 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md
@@ -1,47 +1,30 @@
 This class specifies the specific configurations for the run.
+- - -
 
-Parameters:
-  execution_mode: Runners use this flag to execute different tasks, like
-    training vs evaluation. 'all' (the default) executes both training and
-    eval.
-  master: TensorFlow master. Empty string (the default) for local.
-  task: Task id of the replica running the training (default: 0).
-  num_ps_replicas: Number of parameter server tasks to use (default: 0).
-  training_worker_session_startup_stagger_secs: Seconds to sleep between the
-    startup of each worker task session (default: 5).
-  training_worker_max_startup_secs: Max seconds to wait before starting any
-    worker (default: 60).
-  eval_delay_secs: Number of seconds between the beginning of each eval run.
-    If one run takes more than this amount of time, the next run will start
-    immediately once that run completes (default 60).
-  eval_steps: Number of steps to run in each eval (default: 100).
-  num_cores: Number of cores to be used (default: 4).
-  verbose: Controls the verbosity, possible values:
-    0: the algorithm and debug information is muted.
-    1: trainer prints the progress.
-    2: log device placement is printed.
-  gpu_memory_fraction: Fraction of GPU memory used by the process on
+#### `tf.contrib.learn.RunConfig.__init__(master='', task=0, num_ps_replicas=0, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, tf_random_seed=42, save_summary_steps=100, save_checkpoints_secs=60, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}
+
+Constructor.
+
+##### Args:
+
+
+*  <b>`master`</b>: TensorFlow master. Empty string (the default) for local.
+*  <b>`task`</b>: Task id of the replica running the training (default: 0).
+*  <b>`num_ps_replicas`</b>: Number of parameter server tasks to use (default: 0).
+*  <b>`num_cores`</b>: Number of cores to be used (default: 4).
+*  <b>`log_device_placement`</b>: Log the op placement to devices (default: False).
+*  <b>`gpu_memory_fraction`</b>: Fraction of GPU memory used by the process on
     each GPU uniformly on the same machine.
-  tf_random_seed: Random seed for TensorFlow initializers.
+*  <b>`tf_random_seed`</b>: Random seed for TensorFlow initializers.
     Setting this value allows consistency between reruns.
-  keep_checkpoint_max: The maximum number of recent checkpoint files to keep.
-    As new files are created, older files are deleted.
-    If None or 0, all checkpoint files are kept.
-    Defaults to 5 (that is, the 5 most recent checkpoint files are kept.)
-  keep_checkpoint_every_n_hours: Number of hours between each checkpoint
+*  <b>`save_summary_steps`</b>: Save summaries every this many steps.
+*  <b>`save_checkpoints_secs`</b>: Save checkpoints every this many seconds.
+*  <b>`keep_checkpoint_max`</b>: The maximum number of recent checkpoint files to
+    keep. As new files are created, older files are deleted. If None or 0,
+    all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
+    checkpoint files are kept.)
+*  <b>`keep_checkpoint_every_n_hours`</b>: Number of hours between each checkpoint
     to be saved. The default value of 10,000 hours effectively disables
     the feature.
 
-Attributes:
-  tf_master: Tensorflow master.
-  tf_config: Tensorflow Session Config proto.
-  tf_random_seed: Tensorflow random seed.
-  keep_checkpoint_max: Maximum number of checkpoints to keep.
-  keep_checkpoint_every_n_hours: Number of hours between each checkpoint.
-- - -
-
-#### `tf.contrib.learn.RunConfig.__init__(execution_mode='all', master='', task=0, num_ps_replicas=0, training_worker_session_startup_stagger_secs=5, training_worker_max_startup_secs=60, eval_delay_secs=60, eval_steps=100, num_cores=4, verbose=1, gpu_memory_fraction=1, tf_random_seed=42, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000)` {#RunConfig.__init__}
-
-
-
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md
index 6d53d84c5b7..93fe03b2d78 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.sparse_softmax_cross_entropy_with_logits.md
@@ -18,21 +18,28 @@ a probability distribution for each entry, see
 on `logits` internally for efficiency.  Do not call this op with the
 output of `softmax`, as it will produce incorrect results.
 
-`logits` must have the shape `[batch_size, num_classes]`
-and dtype `float32` or `float64`.
-
-`labels` must have the shape `[batch_size]` and dtype `int32` or `int64`.
+A common use case is to have logits of shape `[batch_size, num_classes]` and
+labels of shape `[batch_size]`. But higher dimensions are supported.
 
 ##### Args:
 
 
-*  <b>`logits`</b>: Unscaled log probabilities.
-*  <b>`labels`</b>: Each entry `labels[i]` must be an index in `[0, num_classes)`. Other
-    values will result in a loss of 0, but incorrect gradient computations.
+*  <b>`logits`</b>: Unscaled log probabilities of rank `r` and shape
+    `[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
+*  <b>`labels`</b>: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
+    `int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
+    Other values will result in a loss of 0, but incorrect gradient
+    computations.
 *  <b>`name`</b>: A name for the operation (optional).
 
 ##### Returns:
 
-  A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
-  softmax cross entropy loss.
+  A `Tensor` of the same shape as `labels` and of the same type as `logits`
+  with the softmax cross entropy loss.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: If logits are scalars (need to have rank >= 1) or if the rank
+    of the labels is not equal to the rank of the logits minus one.
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md
index 581ba4e57e0..f31650eb29b 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md
@@ -21,11 +21,11 @@ A regressor for TensorFlow DNN models.
 
     def input_fn_eval: # returns x, Y
       pass
-    estimator.evaluate(input_fn_eval)
-    estimator.predict(x)
+    estimator.evaluate(input_fn=input_fn_eval)
+    estimator.predict(x=x)
     ```
 
-  Input of `fit`, `train`, and `evaluate` should have following features,
+  Input of `fit` and `evaluate` should have following features,
     otherwise there will be a `KeyError`:
       if `weight_column_name` is not `None`, a feature with
         `key=weight_column_name` whose value is a `Tensor`.
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.max_pool_with_argmax.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.max_pool_with_argmax.md
index 0bf84c16d06..5424efd7a76 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.max_pool_with_argmax.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.max_pool_with_argmax.md
@@ -9,7 +9,7 @@ The indices in `argmax` are flattened, so that a maximum value at position
 ##### Args:
 
 
-*  <b>`input`</b>: A `Tensor` of type `float32`.
+*  <b>`input`</b>: A `Tensor`. Must be one of the following types: `float32`, `half`.
     4-D with shape `[batch, height, width, channels]`.  Input to pool over.
 *  <b>`ksize`</b>: A list of `ints` that has length `>= 4`.
     The size of the window for each dimension of the input tensor.
@@ -25,6 +25,6 @@ The indices in `argmax` are flattened, so that a maximum value at position
 
   A tuple of `Tensor` objects (output, argmax).
 
-*  <b>`output`</b>: A `Tensor` of type `float32`. The max pooled output tensor.
+*  <b>`output`</b>: A `Tensor`. Has the same type as `input`. The max pooled output tensor.
 *  <b>`argmax`</b>: A `Tensor` of type `Targmax`. 4-D.  The flattened indices of the max values chosen for each output.
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_mask.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_mask.md
index d2fa38733b2..4dcd98e6897 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_mask.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_mask.md
@@ -3,8 +3,8 @@
 Masks elements of `IndexedSlices`.
 
 Given an `IndexedSlices` instance `a`, returns another `IndexedSlices` that
-contains a subset of the slices of `a`. Only the slices at indices specified
-in `mask_indices` are returned.
+contains a subset of the slices of `a`. Only the slices at indices not
+specified in `mask_indices` are returned.
 
 This is useful when you need to extract a subset of slices in an
 `IndexedSlices` object.
@@ -18,7 +18,7 @@ a.indices => [12, 26, 37, 45]
 tf.shape(a.values) => [4, 10]
 
 # `b` will be the subset of `a` slices at its second and third indices, so
-# we want to mask of its first and last indices (which are at absolute
+# we want to mask its first and last indices (which are at absolute
 # indices 12, 45)
 b = tf.sparse_mask(a, [12, 45])
 
diff --git a/tensorflow/g3doc/api_docs/python/nn.md b/tensorflow/g3doc/api_docs/python/nn.md
index a6dfbc297db..3c961b8215b 100644
--- a/tensorflow/g3doc/api_docs/python/nn.md
+++ b/tensorflow/g3doc/api_docs/python/nn.md
@@ -690,7 +690,7 @@ The indices in `argmax` are flattened, so that a maximum value at position
 ##### Args:
 
 
-*  <b>`input`</b>: A `Tensor` of type `float32`.
+*  <b>`input`</b>: A `Tensor`. Must be one of the following types: `float32`, `half`.
     4-D with shape `[batch, height, width, channels]`.  Input to pool over.
 *  <b>`ksize`</b>: A list of `ints` that has length `>= 4`.
     The size of the window for each dimension of the input tensor.
@@ -706,7 +706,7 @@ The indices in `argmax` are flattened, so that a maximum value at position
 
   A tuple of `Tensor` objects (output, argmax).
 
-*  <b>`output`</b>: A `Tensor` of type `float32`. The max pooled output tensor.
+*  <b>`output`</b>: A `Tensor`. Has the same type as `input`. The max pooled output tensor.
 *  <b>`argmax`</b>: A `Tensor` of type `Targmax`. 4-D.  The flattened indices of the max values chosen for each output.
 
 
@@ -1244,23 +1244,30 @@ a probability distribution for each entry, see
 on `logits` internally for efficiency.  Do not call this op with the
 output of `softmax`, as it will produce incorrect results.
 
-`logits` must have the shape `[batch_size, num_classes]`
-and dtype `float32` or `float64`.
-
-`labels` must have the shape `[batch_size]` and dtype `int32` or `int64`.
+A common use case is to have logits of shape `[batch_size, num_classes]` and
+labels of shape `[batch_size]`. But higher dimensions are supported.
 
 ##### Args:
 
 
-*  <b>`logits`</b>: Unscaled log probabilities.
-*  <b>`labels`</b>: Each entry `labels[i]` must be an index in `[0, num_classes)`. Other
-    values will result in a loss of 0, but incorrect gradient computations.
+*  <b>`logits`</b>: Unscaled log probabilities of rank `r` and shape
+    `[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
+*  <b>`labels`</b>: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
+    `int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
+    Other values will result in a loss of 0, but incorrect gradient
+    computations.
 *  <b>`name`</b>: A name for the operation (optional).
 
 ##### Returns:
 
-  A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
-  softmax cross entropy loss.
+  A `Tensor` of the same shape as `labels` and of the same type as `logits`
+  with the softmax cross entropy loss.
+
+##### Raises:
+
+
+*  <b>`ValueError`</b>: If logits are scalars (need to have rank >= 1) or if the rank
+    of the labels is not equal to the rank of the logits minus one.
 
 
 - - -
diff --git a/tensorflow/g3doc/api_docs/python/state_ops.md b/tensorflow/g3doc/api_docs/python/state_ops.md
index 4f5c0a7af58..68cd7d33cef 100644
--- a/tensorflow/g3doc/api_docs/python/state_ops.md
+++ b/tensorflow/g3doc/api_docs/python/state_ops.md
@@ -1895,8 +1895,8 @@ Requires `updates.shape = indices.shape + ref.shape[1:]`.
 Masks elements of `IndexedSlices`.
 
 Given an `IndexedSlices` instance `a`, returns another `IndexedSlices` that
-contains a subset of the slices of `a`. Only the slices at indices specified
-in `mask_indices` are returned.
+contains a subset of the slices of `a`. Only the slices at indices not
+specified in `mask_indices` are returned.
 
 This is useful when you need to extract a subset of slices in an
 `IndexedSlices` object.
@@ -1910,7 +1910,7 @@ a.indices => [12, 26, 37, 45]
 tf.shape(a.values) => [4, 10]
 
 # `b` will be the subset of `a` slices at its second and third indices, so
-# we want to mask of its first and last indices (which are at absolute
+# we want to mask its first and last indices (which are at absolute
 # indices 12, 45)
 b = tf.sparse_mask(a, [12, 45])
 
diff --git a/tensorflow/g3doc/how_tos/reading_data/index.md b/tensorflow/g3doc/how_tos/reading_data/index.md
index 554cb854db7..b7ae72c9164 100644
--- a/tensorflow/g3doc/how_tos/reading_data/index.md
+++ b/tensorflow/g3doc/how_tos/reading_data/index.md
@@ -10,7 +10,7 @@ There are three main methods of getting data into a TensorFlow program:
 
 [TOC]
 
-## Feeding 
+## Feeding
 
 TensorFlow's feed mechanism lets you inject data into any Tensor in a
 computation graph. A python computation can thus feed data directly into the
@@ -377,11 +377,11 @@ Again, the example queue will have some elements queued, so training will
 continue until those are exhausted.  If the example queue is a
 [`RandomShuffleQueue`](../../api_docs/python/io_ops.md#RandomShuffleQueue), say
 because you are using `shuffle_batch` or `shuffle_batch_join`, it normally will
-avoid ever going having fewer than its `min_after_dequeue` attr elements
-buffered.  However, once the queue is closed that restriction will be lifted and
-the queue will eventually empty.  At that point the actual training threads,
-when they try and dequeue from example queue, will start getting `OutOfRange`
-errors and exiting.  Once all the training threads are done,
+avoid ever having fewer than its `min_after_dequeue` attr elements buffered.
+However, once the queue is closed that restriction will be lifted and the queue
+will eventually empty.  At that point the actual training threads, when they
+try and dequeue from example queue, will start getting `OutOfRange` errors and
+exiting.  Once all the training threads are done,
 [`tf.train.Coordinator.join`](../../api_docs/python/train.md#Coordinator.join)
 will return and you can exit cleanly.
 
diff --git a/tensorflow/g3doc/how_tos/threading_and_queues/index.md b/tensorflow/g3doc/how_tos/threading_and_queues/index.md
index c6124f92f14..46444a02dbe 100644
--- a/tensorflow/g3doc/how_tos/threading_and_queues/index.md
+++ b/tensorflow/g3doc/how_tos/threading_and_queues/index.md
@@ -146,7 +146,7 @@ for step in xrange(1000000):
 # When done, ask the threads to stop.
 coord.request_stop()
 # And wait for them to actually do it.
-coord.join(threads)
+coord.join(enqueue_threads)
 ```
 
 ## Handling Exceptions
diff --git a/tensorflow/g3doc/tutorials/recurrent/index.md b/tensorflow/g3doc/tutorials/recurrent/index.md
index b5afc186597..5ed26a5e040 100644
--- a/tensorflow/g3doc/tutorials/recurrent/index.md
+++ b/tensorflow/g3doc/tutorials/recurrent/index.md
@@ -178,9 +178,9 @@ https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/get_starte
 [bazel](https://github.com/bazelbuild/bazel)).
 
 Next:
-```
+```bash
 cd tensorflow/models/rnn/ptb
-python ptb_word_lm --data_path=/tmp/simple-examples/data/ --model small
+python ptb_word_lm.py --data_path=/tmp/simple-examples/data/ --model small
 ```
 
 There are 3 supported model configurations in the tutorial code: "small",
diff --git a/tensorflow/models/image/mnist/convolutional.py b/tensorflow/models/image/mnist/convolutional.py
index 95e5347c62c..1893e681210 100644
--- a/tensorflow/models/image/mnist/convolutional.py
+++ b/tensorflow/models/image/mnist/convolutional.py
@@ -48,9 +48,19 @@ EVAL_FREQUENCY = 100  # Number of steps between evaluations.
 
 
 tf.app.flags.DEFINE_boolean("self_test", False, "True if running a self test.")
+tf.app.flags.DEFINE_boolean('use_fp16', False,
+                            "Use half floats instead of full floats if True.")
 FLAGS = tf.app.flags.FLAGS
 
 
+def data_type():
+  """Return the type of the activations, weights, and placeholder variables."""
+  if FLAGS.use_fp16:
+    return tf.float16
+  else:
+    return tf.float32
+
+
 def maybe_download(filename):
   """Download the data from Yann's website, unless it's already here."""
   if not tf.gfile.Exists(WORK_DIRECTORY):
@@ -142,11 +152,11 @@ def main(argv=None):  # pylint: disable=unused-argument
   # These placeholder nodes will be fed a batch of training data at each
   # training step using the {feed_dict} argument to the Run() call below.
   train_data_node = tf.placeholder(
-      tf.float32,
+      data_type(),
       shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
   train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,))
   eval_data = tf.placeholder(
-      tf.float32,
+      data_type(),
       shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
 
   # The variables below hold all the trainable weights. They are passed an
@@ -155,24 +165,24 @@ def main(argv=None):  # pylint: disable=unused-argument
   conv1_weights = tf.Variable(
       tf.truncated_normal([5, 5, NUM_CHANNELS, 32],  # 5x5 filter, depth 32.
                           stddev=0.1,
-                          seed=SEED))
-  conv1_biases = tf.Variable(tf.zeros([32]))
-  conv2_weights = tf.Variable(
-      tf.truncated_normal([5, 5, 32, 64],
-                          stddev=0.1,
-                          seed=SEED))
-  conv2_biases = tf.Variable(tf.constant(0.1, shape=[64]))
+                          seed=SEED, dtype=data_type()))
+  conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type()))
+  conv2_weights = tf.Variable(tf.truncated_normal(
+      [5, 5, 32, 64], stddev=0.1,
+      seed=SEED, dtype=data_type()))
+  conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type()))
   fc1_weights = tf.Variable(  # fully connected, depth 512.
-      tf.truncated_normal(
-          [IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512],
-          stddev=0.1,
-          seed=SEED))
-  fc1_biases = tf.Variable(tf.constant(0.1, shape=[512]))
-  fc2_weights = tf.Variable(
-      tf.truncated_normal([512, NUM_LABELS],
+      tf.truncated_normal([IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64, 512],
                           stddev=0.1,
-                          seed=SEED))
-  fc2_biases = tf.Variable(tf.constant(0.1, shape=[NUM_LABELS]))
+                          seed=SEED,
+                          dtype=data_type()))
+  fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type()))
+  fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS],
+                                                stddev=0.1,
+                                                seed=SEED,
+                                                dtype=data_type()))
+  fc2_biases = tf.Variable(tf.constant(
+      0.1, shape=[NUM_LABELS], dtype=data_type()))
 
   # We will replicate the model structure for the training subgraph, as well
   # as the evaluation subgraphs, while sharing the trainable parameters.
@@ -230,7 +240,7 @@ def main(argv=None):  # pylint: disable=unused-argument
 
   # Optimizer: set up a variable that's incremented once per batch and
   # controls the learning rate decay.
-  batch = tf.Variable(0)
+  batch = tf.Variable(0, dtype=data_type())
   # Decay once per epoch, using an exponential schedule starting at 0.01.
   learning_rate = tf.train.exponential_decay(
       0.01,                # Base learning rate.
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 333bfa17f95..011078036d0 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -99,6 +99,42 @@ def GetShrunkInceptionMaxPoolShapes(shrink=30):
 
 class PoolingTest(tf.test.TestCase):
 
+  def _VerifyOneType(self, pool_func, input_sizes, ksize, strides, padding,
+                     data_format, data_type, expected, use_gpu):
+    """Verifies the output values of the pooling function.
+
+    Args:
+      pool_func: Function to be called, co.MaxPool, co.AvgPool,
+        or the Lua version.
+      input_sizes: Input tensor dimensions.
+      ksize: The kernel size dimensions
+      strides: The stride dimensions
+      padding: Padding type.
+      data_format: The data format we use to run the pooling operation.
+      data_type: The data type to use to run the pooling operation.
+      expected: An array containing the expected operation outputs.
+      use_gpu: Whether we are running on GPU.
+    """
+    total_size = 1
+    for s in input_sizes:
+      total_size *= s
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x = [f * 1.0 for f in range(1, total_size + 1)]
+    with self.test_session(use_gpu=use_gpu) as sess:
+      t = tf.constant(x, shape=input_sizes, dtype=data_type)
+      if data_format == "NCHW":
+        t = NHWCToNCHW(t)
+        ksize = NHWCToNCHW(ksize)
+        strides = NHWCToNCHW(strides)
+      t = pool_func(t, ksize=ksize, strides=strides, padding=padding,
+                    data_format=data_format)
+      if data_format == "NCHW":
+        t = NCHWToNHWC(t)
+      actual = t.eval()
+      self.assertAllCloseAccordingToType(expected, actual.flatten())
+      self.assertShapeEqual(actual, t)
+
   def _VerifyOneTest(self, pool_func, input_sizes, ksize, strides, padding,
                      data_format, expected, use_gpu):
     """Verifies the output values of the pooling function.
@@ -114,25 +150,12 @@ class PoolingTest(tf.test.TestCase):
       expected: An array containing the expected operation outputs.
       use_gpu: Whether we are running on GPU.
     """
-    total_size = 1
-    for s in input_sizes:
-      total_size *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x = [f * 1.0 for f in range(1, total_size + 1)]
-    with self.test_session(use_gpu=use_gpu) as sess:
-      t = tf.constant(x, shape=input_sizes)
-      if data_format == "NCHW":
-        t = NHWCToNCHW(t)
-        ksize = NHWCToNCHW(ksize)
-        strides = NHWCToNCHW(strides)
-      t = pool_func(t, ksize=ksize, strides=strides, padding=padding,
-                    data_format=data_format)
-      if data_format == "NCHW":
-        t = NCHWToNHWC(t)
-      actual = t.eval()
-      self.assertAllClose(expected, actual.flatten())
-      self.assertShapeEqual(actual, t)
+    self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
+                        data_format, tf.float32, expected, use_gpu)
+
+    if not use_gpu or test_util.CudaSupportsHalfMatMulAndConv():
+      self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
+                          data_format, tf.float16, expected, use_gpu)
 
   def _VerifyValues(self, pool_func, input_sizes, ksize, strides, padding,
                     expected, use_gpu):
@@ -372,32 +395,40 @@ class PoolingTest(tf.test.TestCase):
 
   def testKernelSmallerThanStrideValid(self):
     for use_gpu in [True, False]:
-        self._VerifyValues(tf.nn.max_pool, input_sizes=[1, 7, 7, 1],
-                           ksize=[1, 2, 2, 1], strides=[1, 3, 3, 1],
-                           padding="VALID",
-                           expected=[9, 12, 30, 33],
-                           use_gpu=use_gpu)
+      self._VerifyValues(tf.nn.max_pool,
+                         input_sizes=[1, 7, 7, 1],
+                         ksize=[1, 2, 2, 1],
+                         strides=[1, 3, 3, 1],
+                         padding="VALID",
+                         expected=[9, 12, 30, 33],
+                         use_gpu=use_gpu)
 
-        self._VerifyValues(tf.nn.avg_pool, input_sizes=[1, 7, 7, 1],
-                           ksize=[1, 2, 2, 1], strides=[1, 3, 3, 1],
-                           padding="VALID",
-                           expected=[5, 8, 26, 29],
-                           use_gpu=use_gpu)
+      self._VerifyValues(tf.nn.avg_pool,
+                         input_sizes=[1, 7, 7, 1],
+                         ksize=[1, 2, 2, 1],
+                         strides=[1, 3, 3, 1],
+                         padding="VALID",
+                         expected=[5, 8, 26, 29],
+                         use_gpu=use_gpu)
 
   def testKernelSmallerThanStrideSame(self):
     for use_gpu in [True, False]:
-        for pool_func in [tf.nn.max_pool, tf.nn.avg_pool]:
-            self._VerifyValues(pool_func, input_sizes=[1, 3, 3, 1],
-                               ksize=[1, 1, 1, 1], strides=[1, 2, 2, 1],
-                               padding="SAME",
-                               expected=[1, 3, 7, 9],
-                               use_gpu=use_gpu)
+      for pool_func in [tf.nn.max_pool, tf.nn.avg_pool]:
+        self._VerifyValues(pool_func,
+                           input_sizes=[1, 3, 3, 1],
+                           ksize=[1, 1, 1, 1],
+                           strides=[1, 2, 2, 1],
+                           padding="SAME",
+                           expected=[1, 3, 7, 9],
+                           use_gpu=use_gpu)
 
-            self._VerifyValues(pool_func, input_sizes=[1, 4, 4, 1],
-                               ksize=[1, 1, 1, 1], strides=[1, 2, 2, 1],
-                               padding="SAME",
-                               expected=[1, 3, 9, 11],
-                               use_gpu=use_gpu)
+        self._VerifyValues(pool_func,
+                           input_sizes=[1, 4, 4, 1],
+                           ksize=[1, 1, 1, 1],
+                           strides=[1, 2, 2, 1],
+                           padding="SAME",
+                           expected=[1, 3, 9, 11],
+                           use_gpu=use_gpu)
 
   def _testDepthwiseMaxPoolInvalidConfig(self, in_size, ksize, strides,
                                          error_msg, use_gpu=False):
@@ -425,43 +456,49 @@ class PoolingTest(tf.test.TestCase):
   # The following are tests that verify that the CPU and GPU implementations
   # produce the same resuts.
   def _CompareMaxPoolingFwd(self, input_shape, ksize, strides, padding):
-    tensor_input = np.random.rand(*input_shape).astype(np.float32)
-    with self.test_session(use_gpu=True):
-      t = tf.constant(tensor_input, shape=input_shape)
-      out_op, _ = tf.nn.max_pool_with_argmax(t, ksize, strides, padding)
-      gpu_val = out_op.eval()
-    with self.test_session(use_gpu=False):
-      t = tf.constant(tensor_input, shape=input_shape)
-      out_op = tf.nn.max_pool(t, ksize, strides, padding)
-      cpu_val = out_op.eval()
-    self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5)
+    for dtype in np.float32, np.float16:
+      tensor_input = np.random.rand(*input_shape).astype(dtype)
+      with self.test_session(use_gpu=True):
+        t = tf.constant(tensor_input, shape=input_shape)
+        out_op, _ = tf.nn.max_pool_with_argmax(t, ksize, strides, padding)
+        gpu_val = out_op.eval()
+      with self.test_session(use_gpu=False):
+        t = tf.constant(tensor_input, shape=input_shape)
+        out_op = tf.nn.max_pool(t, ksize, strides, padding)
+        cpu_val = out_op.eval()
+      self.assertAllCloseAccordingToType(cpu_val, gpu_val)
 
   def _CompareMaxPoolingBk(self, input_shape, output_shape, ksize, strides,
                            padding):
-    # Generate numbers in a narrow range, so that there are many duplicates
-    # in the input.
-    tensor_input = np.random.random_integers(0, 3,
-                                             input_shape).astype(np.float32)
-    tensor_output = np.random.rand(*output_shape).astype(np.float32)
-    with self.test_session(use_gpu=True):
-      t = tf.constant(tensor_input, shape=input_shape)
-      _, argmax_op = tf.nn.max_pool_with_argmax(t, ksize, strides, padding)
-      argmax = argmax_op.eval()
-      grad_in = tf.constant(tensor_output, shape=output_shape)
-      out_op = gen_nn_ops._max_pool_grad_with_argmax(t, grad_in, argmax,
-                                                     ksize, strides, padding)
-      gpu_val = out_op.eval()
-      self.assertShapeEqual(gpu_val, out_op)
-    with self.test_session(use_gpu=False):
-      t = tf.constant(tensor_input, shape=input_shape)
-      out_op = tf.nn.max_pool(t, ksize, strides, padding)
-      orig_out = out_op.eval()
-      grad_in = tf.constant(tensor_output, shape=output_shape)
-      out_op = gen_nn_ops._max_pool_grad(t, orig_out, grad_in, ksize,
-                                         strides, padding)
-      cpu_val = out_op.eval()
-      self.assertShapeEqual(cpu_val, out_op)
-    self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5)
+    for dtype in np.float32, np.float16:
+      # Generate numbers in a narrow range, so that there are many duplicates
+      # in the input.
+      tensor_input = np.random.random_integers(0, 3, input_shape).astype(dtype)
+      tensor_output = np.random.rand(*output_shape).astype(dtype)
+      with self.test_session(use_gpu=True):
+        t = tf.constant(tensor_input, shape=input_shape)
+        _, argmax_op = tf.nn.max_pool_with_argmax(t, ksize, strides, padding)
+        argmax = argmax_op.eval()
+        grad_in = tf.constant(tensor_output, shape=output_shape)
+        out_op = gen_nn_ops._max_pool_grad_with_argmax(t, grad_in, argmax,
+                                                       ksize, strides, padding)
+        gpu_val = out_op.eval()
+        self.assertShapeEqual(gpu_val, out_op)
+      with self.test_session(use_gpu=False):
+        t = tf.constant(tensor_input, shape=input_shape)
+        out_op = tf.nn.max_pool(t, ksize, strides, padding)
+        orig_out = out_op.eval()
+        grad_in = tf.constant(tensor_output, shape=output_shape)
+        out_op = gen_nn_ops._max_pool_grad(t, orig_out, grad_in, ksize, strides,
+                                           padding)
+        cpu_val = out_op.eval()
+        self.assertShapeEqual(cpu_val, out_op)
+      if dtype == np.float16:
+        # The CPU version accumulates its gradient on fp16, so it's less
+        # accurate than the GPU version that does the accumulation on fp32
+        self.assertAllClose(cpu_val, gpu_val, rtol=0.01, atol=0.01)
+      else:
+        self.assertAllClose(cpu_val, gpu_val)
 
   def testMaxPoolingWithArgmax(self):
     # MaxPoolWithArgMax is implemented only on GPU.
diff --git a/tensorflow/python/kernel_tests/sparse_xent_op_test.py b/tensorflow/python/kernel_tests/sparse_xent_op_test.py
index a8050cb08db..eb6bdff8b5a 100644
--- a/tensorflow/python/kernel_tests/sparse_xent_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_xent_op_test.py
@@ -30,6 +30,9 @@ from tensorflow.python.ops import sparse_ops
 class SparseXentTest(tf.test.TestCase):
 
   def _npXent(self, features, labels):
+    is_higher_dim = len(features.shape) > 2
+    features = np.reshape(features, [-1, features.shape[-1]])
+    labels = np.reshape(labels, [-1])
     batch_dim = 0
     class_dim = 1
     batch_size = features.shape[batch_dim]
@@ -40,14 +43,15 @@ class SparseXentTest(tf.test.TestCase):
     labels_mat[np.arange(batch_size), labels] = 1.0
     bp = (probs - labels_mat)
     l = -np.sum(labels_mat * np.log(probs + 1.0e-20), axis=1)
-    return l, bp
+    return l, bp, is_higher_dim
 
   def _testXent(self, np_features, np_labels, use_gpu=False):
-    np_loss, np_backprop = self._npXent(np_features, np_labels)
+    np_loss, np_backprop, is_higher_dim = self._npXent(np_features, np_labels)
     with self.test_session(use_gpu=use_gpu) as sess:
       loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
           np_features, np_labels)
-      backprop = loss.op.outputs[1]
+      backprop = (loss.op.inputs[0].op.outputs[1] if is_higher_dim
+                  else loss.op.outputs[1])
       tf_loss, tf_backprop = sess.run([loss, backprop])
     self.assertAllCloseAccordingToType(np_loss, tf_loss)
     self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
@@ -71,14 +75,6 @@ class SparseXentTest(tf.test.TestCase):
     self._testSingleClass(use_gpu=True)
     self._testSingleClass(use_gpu=False)
 
-  def testRankTooLarge(self):
-    np_features = np.array(
-        [[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]).astype(np.float32)
-    np_labels = np.array([1, 2])
-    self.assertRaisesRegexp(
-        ValueError, "must have rank 2",
-        tf.nn.sparse_softmax_cross_entropy_with_logits, np_features, np_labels)
-
   def testNpXent(self):
     # We create 2 batches of logits for testing.
     # batch 0 is the boring uniform distribution: 1, 1, 1, 1, with target 3.
@@ -104,7 +100,7 @@ class SparseXentTest(tf.test.TestCase):
     # With a hard 1, the backprop is [0.032 - 1.0 = -0.968, 0.087, 0.237, 0.644]
     # The loss for this batch is [1.0 * -log(0.25), 1.0 * -log(0.032)]
     # = [1.3862, 3.4420]
-    np_loss, np_backprop = self._npXent(np.array(features), np.array(labels))
+    np_loss, np_backprop, _ = self._npXent(np.array(features), np.array(labels))
     self.assertAllClose(np.array([[0.25, 0.25, 0.25, -0.75],
                                   [-0.968, 0.087, 0.237, 0.6439]]),
                         np_backprop,
@@ -114,15 +110,21 @@ class SparseXentTest(tf.test.TestCase):
 
   def testShapeMismatch(self):
     with self.test_session():
-      with self.assertRaises(ValueError):
+      with self.assertRaisesRegexp(ValueError, ".*Rank mismatch:*"):
         tf.nn.sparse_softmax_cross_entropy_with_logits(
-            [[0., 1.], [2., 3.]], [[0, 2]])
+            [[0., 1.], [2., 3.], [2., 3.]], [[0, 2]])
 
-  def testNotMatrix(self):
+  def testScalar(self):
     with self.test_session():
-      with self.assertRaises(ValueError):
+      with self.assertRaisesRegexp(ValueError, ".*Logits cannot be scalars*"):
         tf.nn.sparse_softmax_cross_entropy_with_logits(
-            [0., 1., 2., 3.], [0, 2])
+            tf.constant(1.0), tf.constant(0))
+
+  def testVector(self):
+    with self.test_session():
+      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+          tf.constant([1.0]), tf.constant(0))
+      self.assertAllClose(0.0, loss.eval())
 
   def testFloat(self):
     for label_dtype in np.int32, np.int64:
@@ -155,6 +157,31 @@ class SparseXentTest(tf.test.TestCase):
     print("cross entropy gradient err = ", err)
     self.assertLess(err, 5e-8)
 
+  def _testHighDim(self, use_gpu, features, labels):
+    np_loss, np_backprop, _ = self._npXent(np.array(features), np.array(labels))
+    # manually reshape loss
+    np_loss = np.reshape(np_loss, np.array(labels).shape)
+    with self.test_session(use_gpu=use_gpu) as sess:
+      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+          features, labels)
+      backprop = loss.op.inputs[0].op.outputs[1]
+      tf_loss, tf_backprop = sess.run([loss, backprop])
+    self.assertAllCloseAccordingToType(np_loss, tf_loss)
+    self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
+
+  def testHighDim(self):
+    features = [[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]
+    labels = [[3], [0]]
+    self._testHighDim(True, features, labels)
+    self._testHighDim(False, features, labels)
+
+  def testHighDim2(self):
+    features = [[[1., 1., 1., 1.], [2., 2., 2., 2.]],
+                [[1., 2., 3., 4.], [5., 6., 7., 8.]]]
+    labels = [[3, 2], [0, 3]]
+    self._testHighDim(True, features, labels)
+    self._testHighDim(False, features, labels)
+
 
 def _sparse_vs_dense_xent_benchmark_dense(labels, logits):
   labels = tf.identity(labels)
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index c71ec9f065e..fd442c6eb88 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -126,6 +126,7 @@ def shape(input, name=None):
     else:
       return gen_array_ops.shape(input, name=name)
 
+
 def rank(input, name=None):
   """Returns the rank of a tensor.
 
@@ -612,8 +613,8 @@ def sparse_mask(a, mask_indices, name=None):
   """Masks elements of `IndexedSlices`.
 
   Given an `IndexedSlices` instance `a`, returns another `IndexedSlices` that
-  contains a subset of the slices of `a`. Only the slices at indices specified
-  in `mask_indices` are returned.
+  contains a subset of the slices of `a`. Only the slices at indices not
+  specified in `mask_indices` are returned.
 
   This is useful when you need to extract a subset of slices in an
   `IndexedSlices` object.
@@ -627,7 +628,7 @@ def sparse_mask(a, mask_indices, name=None):
   tf.shape(a.values) => [4, 10]
 
   # `b` will be the subset of `a` slices at its second and third indices, so
-  # we want to mask of its first and last indices (which are at absolute
+  # we want to mask its first and last indices (which are at absolute
   # indices 12, 45)
   b = tf.sparse_mask(a, [12, 45])
 
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 8fb81a813ad..baaa6391e95 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -440,30 +440,65 @@ def sparse_softmax_cross_entropy_with_logits(logits, labels, name=None):
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  `logits` must have the shape `[batch_size, num_classes]`
-  and dtype `float32` or `float64`.
-
-  `labels` must have the shape `[batch_size]` and dtype `int32` or `int64`.
+  A common use case is to have logits of shape `[batch_size, num_classes]` and
+  labels of shape `[batch_size]`. But higher dimensions are supported.
 
   Args:
-    logits: Unscaled log probabilities.
-    labels: Each entry `labels[i]` must be an index in `[0, num_classes)`. Other
-      values will result in a loss of 0, but incorrect gradient computations.
+    logits: Unscaled log probabilities of rank `r` and shape
+      `[d_0, d_1, ..., d_{r-2}, num_classes]` and dtype `float32` or `float64`.
+    labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-2}]` and dtype `int32` or
+      `int64`. Each entry in `labels` must be an index in `[0, num_classes)`.
+      Other values will result in a loss of 0, but incorrect gradient
+      computations.
     name: A name for the operation (optional).
 
   Returns:
-    A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
-    softmax cross entropy loss.
+    A `Tensor` of the same shape as `labels` and of the same type as `logits`
+    with the softmax cross entropy loss.
+
+  Raises:
+    ValueError: If logits are scalars (need to have rank >= 1) or if the rank
+      of the labels is not equal to the rank of the logits minus one.
   """
   # TODO(pcmurray) Raise an error when the label is not an index in
   # [0, num_classes). Note: This could break users who call this with bad
   # labels, but disregard the bad results.
 
-  # The second output tensor contains the gradients.  We use it in
-  # _CrossEntropyGrad() in nn_grad but not here.
-  cost, unused_backprop = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
-      logits, labels, name=name)
-  return cost
+  # Reshape logits and labels to rank 2.
+  with ops.op_scope([labels, logits], name,
+                    "SparseSoftmaxCrossEntropyWithLogits"):
+    labels = ops.convert_to_tensor(labels)
+    logits = ops.convert_to_tensor(logits)
+
+    # Store label shape for result later.
+    labels_static_shape = labels.get_shape()
+    labels_shape = array_ops.shape(labels)
+    if logits.get_shape().ndims is not None and logits.get_shape().ndims == 0:
+      raise ValueError("Logits cannot be scalars - received shape %s." %
+                       logits.get_shape())
+    if logits.get_shape().ndims is not None and (
+        labels_static_shape.ndims is not None and
+        labels_static_shape.ndims != logits.get_shape().ndims - 1):
+      raise ValueError("Rank mismatch: Labels rank (received %s) should equal "
+                       "logits rank (received %s) - 1." %
+                       (labels_static_shape.ndims, logits.get_shape().ndims))
+    # Check if no reshapes are required.
+    if logits.get_shape().ndims == 2:
+      cost, _ = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
+          logits, labels, name=name)
+      return cost
+    # Reshape logits to 2 dim, labels to 1 dim.
+    num_classes = array_ops.gather(array_ops.shape(logits),
+                                   array_ops.rank(logits) - 1)
+    logits = array_ops.reshape(logits, [-1, num_classes])
+    labels = array_ops.reshape(labels, [-1])
+    # The second output tensor contains the gradients.  We use it in
+    # _CrossEntropyGrad() in nn_grad but not here.
+    cost, _ = gen_nn_ops._sparse_softmax_cross_entropy_with_logits(
+        logits, labels, name=name)
+    cost = array_ops.reshape(cost, labels_shape)
+    cost.set_shape(labels_static_shape)
+    return cost
 
 
 @ops.RegisterShape("SparseSoftmaxCrossEntropyWithLogits")
diff --git a/tensorflow/python/training/proximal_adagrad.py b/tensorflow/python/training/proximal_adagrad.py
new file mode 100644
index 00000000000..d1bfe707124
--- /dev/null
+++ b/tensorflow/python/training/proximal_adagrad.py
@@ -0,0 +1,101 @@
+# Copyright 2015 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""ProximalAdagrad for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import constant_op
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+
+
+class ProximalAdagradOptimizer(optimizer.Optimizer):
+  # pylint: disable=line-too-long
+  """Optimizer that implements the Proximal Adagrad algorithm.
+
+  See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf).
+
+  @@__init__
+  """
+
+  def __init__(self, learning_rate, initial_accumulator_value=0.1,
+               l1_regularization_strength=0.0, l2_regularization_strength=0.0,
+               use_locking=False, name="ProximalAdagrad"):
+    """Construct a new ProximalAdagrad optimizer.
+
+    Args:
+      learning_rate: A `Tensor` or a floating point value.  The learning rate.
+      initial_accumulator_value: A floating point value.
+        Starting value for the accumulators, must be positive.
+      l1_regularization_strength: A float value, must be greater than or
+        equal to zero.
+      l2_regularization_strength: A float value, must be greater than or
+        equal to zero.
+      use_locking: If `True` use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "ProximalAdagrad".
+
+    Raises:
+      ValueError: If the `initial_accumulator_value` is invalid.
+    """
+    if initial_accumulator_value <= 0.0:
+      raise ValueError("initial_accumulator_value must be positive: %s" %
+                       initial_accumulator_value)
+    super(ProximalAdagradOptimizer, self).__init__(use_locking, name)
+    self._learning_rate = learning_rate
+    self._initial_accumulator_value = initial_accumulator_value
+    self._l1_regularization_strength = l1_regularization_strength
+    self._l2_regularization_strength = l2_regularization_strength
+    # Created in _prepare().
+    self._l1_regularization_strength_tensor = None
+    self._l2_regularization_strength_tensor = None
+    self._learning_rate_tensor = None
+
+  def _create_slots(self, var_list):
+    for v in var_list:
+      with ops.colocate_with(v):
+        val = constant_op.constant(self._initial_accumulator_value,
+                                   shape=v.get_shape())
+      self._get_or_make_slot(v, val, "accumulator", self._name)
+
+  def _prepare(self):
+    self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
+                                                       name="learning_rate")
+    self._l1_regularization_strength_tensor = ops.convert_to_tensor(
+        self._l1_regularization_strength,
+        name="l1_regularization_strength")
+    self._l2_regularization_strength_tensor = ops.convert_to_tensor(
+        self._l2_regularization_strength,
+        name="l2_regularization_strength")
+
+  def _apply_dense(self, grad, var):
+    acc = self.get_slot(var, "accumulator")
+    return training_ops.apply_proximal_adagrad(
+        var, acc, self._learning_rate_tensor,
+        self._l1_regularization_strength_tensor,
+        self._l2_regularization_strength_tensor,
+        grad, use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var):
+    acc = self.get_slot(var, "accumulator")
+    return training_ops.sparse_apply_proximal_adagrad(
+        var, acc, self._learning_rate_tensor,
+        self._l1_regularization_strength_tensor,
+        self._l2_regularization_strength_tensor,
+        grad.values, grad.indices,
+        use_locking=self._use_locking)
diff --git a/tensorflow/python/training/proximal_adagrad_test.py b/tensorflow/python/training/proximal_adagrad_test.py
new file mode 100644
index 00000000000..30e6245ef24
--- /dev/null
+++ b/tensorflow/python/training/proximal_adagrad_test.py
@@ -0,0 +1,205 @@
+# Copyright 2015 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functional tests for Proximal Adagrad operations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+
+class ProximalAdagradOptimizerTest(tf.test.TestCase):
+
+  def testProximalAdagradwithoutRegularization(self):
+    """Three unregularized steps starting from all-zero variables."""
+    with self.test_session() as sess:
+      w0 = tf.Variable([0.0, 0.0])
+      w1 = tf.Variable([0.0, 0.0])
+      g0 = tf.constant([0.1, 0.2])
+      g1 = tf.constant([0.01, 0.02])
+      optimizer = tf.train.ProximalAdagradOptimizer(
+          3.0, initial_accumulator_value=0.1,
+          l1_regularization_strength=0.0, l2_regularization_strength=0.0)
+      train_op = optimizer.apply_gradients(zip([g0, g1], [w0, w1]))
+      tf.initialize_all_variables().run()
+
+      # The variables still hold their initial values before any update.
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose([0.0, 0.0], w0_val)
+      self.assertAllClose([0.0, 0.0], w1_val)
+
+      # Apply the same gradients three times.
+      for _ in range(3):
+        train_op.run()
+
+      # Expected values after the three updates.
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose(np.array([-2.60260963, -4.29698515]), w0_val)
+      self.assertAllClose(np.array([-0.28432083, -0.56694895]), w1_val)
+
+  def testProximalAdagradwithoutRegularization2(self):
+    """Three unregularized steps starting from non-zero variables."""
+    with self.test_session() as sess:
+      w0 = tf.Variable([1.0, 2.0])
+      w1 = tf.Variable([4.0, 3.0])
+      g0 = tf.constant([0.1, 0.2])
+      g1 = tf.constant([0.01, 0.02])
+
+      optimizer = tf.train.ProximalAdagradOptimizer(
+          3.0, initial_accumulator_value=0.1,
+          l1_regularization_strength=0.0, l2_regularization_strength=0.0)
+      train_op = optimizer.apply_gradients(zip([g0, g1], [w0, w1]))
+      tf.initialize_all_variables().run()
+
+      # The variables still hold their initial values before any update.
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose([1.0, 2.0], w0_val)
+      self.assertAllClose([4.0, 3.0], w1_val)
+
+      # Apply the same gradients three times.
+      for _ in range(3):
+        train_op.run()
+
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose(np.array([-1.60261, -2.296985]), w0_val)
+      self.assertAllClose(np.array([3.715679, 2.433051]), w1_val)
+
+  def testProximalAdagradWithL1(self):
+    """Ten steps with L1 regularization only."""
+    with self.test_session() as sess:
+      w0 = tf.Variable([1.0, 2.0])
+      w1 = tf.Variable([4.0, 3.0])
+      g0 = tf.constant([0.1, 0.2])
+      g1 = tf.constant([0.01, 0.02])
+
+      optimizer = tf.train.ProximalAdagradOptimizer(
+          3.0, initial_accumulator_value=0.1,
+          l1_regularization_strength=0.001, l2_regularization_strength=0.0)
+      train_op = optimizer.apply_gradients(zip([g0, g1], [w0, w1]))
+      tf.initialize_all_variables().run()
+
+      # The variables still hold their initial values before any update.
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose([1.0, 2.0], w0_val)
+      self.assertAllClose([4.0, 3.0], w1_val)
+
+      # Apply the same gradients ten times.
+      for _ in range(10):
+        train_op.run()
+
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose(np.array([0.662907, 0.767398]), w0_val)
+      self.assertAllClose(np.array([2.959304, 1.029232]), w1_val)
+
+  def testProximalAdagradWithL1_L2(self):
+    """Ten steps with both L1 and L2 regularization."""
+    with self.test_session() as sess:
+      w0 = tf.Variable([1.0, 2.0])
+      w1 = tf.Variable([4.0, 3.0])
+      g0 = tf.constant([0.1, 0.2])
+      g1 = tf.constant([0.01, 0.02])
+
+      optimizer = tf.train.ProximalAdagradOptimizer(
+          3.0, initial_accumulator_value=0.1,
+          l1_regularization_strength=0.001, l2_regularization_strength=2.0)
+      train_op = optimizer.apply_gradients(zip([g0, g1], [w0, w1]))
+      tf.initialize_all_variables().run()
+
+      # The variables still hold their initial values before any update.
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose([1.0, 2.0], w0_val)
+      self.assertAllClose([4.0, 3.0], w1_val)
+
+      # Apply the same gradients ten times.
+      for _ in range(10):
+        train_op.run()
+
+      # Expected values after the ten updates.
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose(np.array([0.043069, 0.080461]), w0_val)
+      self.assertAllClose(np.array([0.004069, 0.008578]), w1_val)
+
+  def applyOptimizer(self, opt, steps=5, is_sparse=False):
+    """Runs `opt` for `steps` updates on fixed data; returns both variables."""
+    if is_sparse:
+      w0 = tf.Variable([[1.0], [2.0]])
+      w1 = tf.Variable([[3.0], [4.0]])
+      # Each gradient touches a single row of its 2x1 variable.
+      g0 = tf.IndexedSlices(
+          tf.constant([0.1], shape=[1, 1]), tf.constant([0]),
+          tf.constant([2, 1]))
+      g1 = tf.IndexedSlices(
+          tf.constant([0.02], shape=[1, 1]), tf.constant([1]),
+          tf.constant([2, 1]))
+    else:
+      w0 = tf.Variable([1.0, 2.0])
+      w1 = tf.Variable([3.0, 4.0])
+      g0 = tf.constant([0.1, 0.2])
+      g1 = tf.constant([0.01, 0.02])
+
+    train_op = opt.apply_gradients(zip([g0, g1], [w0, w1]))
+    tf.initialize_all_variables().run()
+
+    sess = tf.get_default_session()
+    w0_val, w1_val = sess.run([w0, w1])
+    if is_sparse:
+      self.assertAllClose([[1.0], [2.0]], w0_val)
+      self.assertAllClose([[3.0], [4.0]], w1_val)
+    else:
+      self.assertAllClose([1.0, 2.0], w0_val)
+      self.assertAllClose([3.0, 4.0], w1_val)
+
+    for _ in range(steps):  # Apply the same gradients `steps` times.
+      train_op.run()
+
+    return tuple(sess.run([w0, w1]))
+
+  def testEquivAdagradwithoutRegularization(self):
+    """Zero L1/L2 must reproduce plain Adagrad on dense gradients."""
+    with self.test_session():
+      proximal = tf.train.ProximalAdagradOptimizer(
+          3.0, initial_accumulator_value=0.1,
+          l1_regularization_strength=0.0, l2_regularization_strength=0.0)
+      prox0, prox1 = self.applyOptimizer(proximal)
+
+    with self.test_session():
+      plain = tf.train.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
+      base0, base1 = self.applyOptimizer(plain)
+
+    self.assertAllClose(prox0, base0)
+    self.assertAllClose(prox1, base1)
+
+  def testEquivSparseAdagradwithoutRegularization(self):
+    """Zero L1/L2 must reproduce plain Adagrad on sparse gradients."""
+    with self.test_session():
+      proximal = tf.train.ProximalAdagradOptimizer(
+          3.0, initial_accumulator_value=0.1,
+          l1_regularization_strength=0.0, l2_regularization_strength=0.0)
+      prox0, prox1 = self.applyOptimizer(proximal, is_sparse=True)
+
+    # Reference run with the unregularized optimizer.
+    with self.test_session():
+      plain = tf.train.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
+      base0, base1 = self.applyOptimizer(plain, is_sparse=True)
+
+    # Both variables must match between the two optimizers.
+    self.assertAllClose(prox0, base0)
+    self.assertAllClose(prox1, base1)
+
+
+if __name__ == "__main__":  # Entry point when this file is run as a script.
+  tf.test.main()
diff --git a/tensorflow/python/training/proximal_gradient_descent.py b/tensorflow/python/training/proximal_gradient_descent.py
new file mode 100644
index 00000000000..299c6fa1c7b
--- /dev/null
+++ b/tensorflow/python/training/proximal_gradient_descent.py
@@ -0,0 +1,81 @@
+# Copyright 2015 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""ProximalGradientDescent for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+# pylint: disable=unused-import
+from tensorflow.python.ops import math_ops
+# pylint: enable=unused-import
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+
+
+class ProximalGradientDescentOptimizer(optimizer.Optimizer):
+  # pylint: disable=line-too-long
+  """Optimizer that implements the proximal gradient descent algorithm.
+
+  See this [paper](http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf).
+
+  @@__init__
+  """
+
+  def __init__(self, learning_rate, l1_regularization_strength=0.0,
+               l2_regularization_strength=0.0, use_locking=False,
+               name="ProximalGradientDescent"):
+    """Construct a new proximal gradient descent optimizer.
+
+    Args:
+      learning_rate: A Tensor or a floating point value. The learning rate.
+      l1_regularization_strength: A float value, must be >= 0.
+      l2_regularization_strength: A float value, must be >= 0.
+      use_locking: If True use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "ProximalGradientDescent".
+    """
+    super(ProximalGradientDescentOptimizer, self).__init__(use_locking, name)
+    self._learning_rate = learning_rate
+    self._l1_regularization_strength = l1_regularization_strength
+    self._l2_regularization_strength = l2_regularization_strength
+    self._learning_rate_tensor = None  # Tensors are created in _prepare().
+    self._l1_regularization_strength_tensor = None
+    self._l2_regularization_strength_tensor = None
+
+  def _apply_dense(self, grad, var):
+    """Builds the proximal update of `var` from a dense gradient."""
+    return training_ops.apply_proximal_gradient_descent(
+        var, self._learning_rate_tensor,
+        self._l1_regularization_strength_tensor,
+        self._l2_regularization_strength_tensor,
+        grad, use_locking=self._use_locking).op
+
+  def _apply_sparse(self, grad, var):
+    # Use the proximal op so L1/L2 are applied to sparse gradients as well.
+    return training_ops.sparse_apply_proximal_gradient_descent(
+        var, self._learning_rate_tensor,
+        self._l1_regularization_strength_tensor,
+        self._l2_regularization_strength_tensor,
+        grad.values, grad.indices, use_locking=self._use_locking).op
+
+  def _prepare(self):  # Converts the hyperparameters into tensors.
+    self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
+                                                       name="learning_rate")
+    self._l1_regularization_strength_tensor = ops.convert_to_tensor(
+        self._l1_regularization_strength, name="l1_regularization_strength")
+    self._l2_regularization_strength_tensor = ops.convert_to_tensor(
+        self._l2_regularization_strength, name="l2_regularization_strength")
diff --git a/tensorflow/python/training/proximal_gradient_descent_test.py b/tensorflow/python/training/proximal_gradient_descent_test.py
new file mode 100644
index 00000000000..4dd02526873
--- /dev/null
+++ b/tensorflow/python/training/proximal_gradient_descent_test.py
@@ -0,0 +1,178 @@
+# Copyright 2015 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functional tests for Proximal Gradient Descent operations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+
+class ProximalGradientDescentOptimizerTest(tf.test.TestCase):
+
+  def testProximalGradientDescentwithoutRegularization(self):
+    """Three unregularized steps starting from all-zero variables."""
+    with self.test_session() as sess:
+      w0 = tf.Variable([0.0, 0.0])
+      w1 = tf.Variable([0.0, 0.0])
+      g0 = tf.constant([0.1, 0.2])
+      g1 = tf.constant([0.01, 0.02])
+      optimizer = tf.train.ProximalGradientDescentOptimizer(
+          3.0, l1_regularization_strength=0.0,
+          l2_regularization_strength=0.0)
+      train_op = optimizer.apply_gradients(zip([g0, g1], [w0, w1]))
+      tf.initialize_all_variables().run()
+
+      # The variables still hold their initial values before any update.
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose([0.0, 0.0], w0_val)
+      self.assertAllClose([0.0, 0.0], w1_val)
+
+      # Apply the same gradients three times.
+      for _ in range(3):
+        train_op.run()
+
+      # Expected values after the three updates.
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose(np.array([-0.9, -1.8]), w0_val)
+      self.assertAllClose(np.array([-0.09, -0.18]), w1_val)
+
+  def testProximalGradientDescentwithoutRegularization2(self):
+    """Three unregularized steps starting from non-zero variables."""
+    with self.test_session() as sess:
+      w0 = tf.Variable([1.0, 2.0])
+      w1 = tf.Variable([4.0, 3.0])
+      g0 = tf.constant([0.1, 0.2])
+      g1 = tf.constant([0.01, 0.02])
+
+      optimizer = tf.train.ProximalGradientDescentOptimizer(
+          3.0, l1_regularization_strength=0.0,
+          l2_regularization_strength=0.0)
+      train_op = optimizer.apply_gradients(zip([g0, g1], [w0, w1]))
+      tf.initialize_all_variables().run()
+
+      # The variables still hold their initial values before any update.
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose([1.0, 2.0], w0_val)
+      self.assertAllClose([4.0, 3.0], w1_val)
+
+      # Apply the same gradients three times.
+      for _ in range(3):
+        train_op.run()
+
+      # Expected values after the three updates.
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose(np.array([0.1, 0.2]), w0_val)
+      self.assertAllClose(np.array([3.91, 2.82]), w1_val)
+
+  def testProximalGradientDescentWithL1_L2(self):
+    """Ten steps with both L1 and L2 regularization."""
+    with self.test_session() as sess:
+      w0 = tf.Variable([1.0, 2.0])
+      w1 = tf.Variable([4.0, 3.0])
+      g0 = tf.constant([0.1, 0.2])
+      g1 = tf.constant([0.01, 0.02])
+
+      optimizer = tf.train.ProximalGradientDescentOptimizer(
+          3.0, l1_regularization_strength=0.001,
+          l2_regularization_strength=2.0)
+      train_op = optimizer.apply_gradients(zip([g0, g1], [w0, w1]))
+      tf.initialize_all_variables().run()
+
+      # The variables still hold their initial values before any update.
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose([1.0, 2.0], w0_val)
+      self.assertAllClose([4.0, 3.0], w1_val)
+
+      # Apply the same gradients ten times.
+      for _ in range(10):
+        train_op.run()
+
+      # Expected values after the ten updates.
+      w0_val, w1_val = sess.run([w0, w1])
+      self.assertAllClose(np.array([0.037125, 0.074625]), w0_val)
+      self.assertAllClose(np.array([0.003375, 0.007125]), w1_val)
+
+  def applyOptimizer(self, opt, steps=5, is_sparse=False):
+    """Runs `opt` for `steps` updates on fixed data; returns both variables."""
+    if is_sparse:
+      w0 = tf.Variable([[1.0], [2.0]])
+      w1 = tf.Variable([[3.0], [4.0]])
+      # Each gradient touches a single row of its 2x1 variable.
+      g0 = tf.IndexedSlices(
+          tf.constant([0.1], shape=[1, 1]), tf.constant([0]),
+          tf.constant([2, 1]))
+      g1 = tf.IndexedSlices(
+          tf.constant([0.02], shape=[1, 1]), tf.constant([1]),
+          tf.constant([2, 1]))
+    else:
+      w0 = tf.Variable([1.0, 2.0])
+      w1 = tf.Variable([3.0, 4.0])
+      g0 = tf.constant([0.1, 0.2])
+      g1 = tf.constant([0.01, 0.02])
+
+    train_op = opt.apply_gradients(zip([g0, g1], [w0, w1]))
+    tf.initialize_all_variables().run()
+
+    sess = tf.get_default_session()
+    w0_val, w1_val = sess.run([w0, w1])
+    if is_sparse:
+      self.assertAllClose([[1.0], [2.0]], w0_val)
+      self.assertAllClose([[3.0], [4.0]], w1_val)
+    else:
+      self.assertAllClose([1.0, 2.0], w0_val)
+      self.assertAllClose([3.0, 4.0], w1_val)
+
+    for _ in range(steps):  # Apply the same gradients `steps` times.
+      train_op.run()
+
+    return tuple(sess.run([w0, w1]))
+
+  def testEquivSparseGradientDescentwithoutRegularizaion(self):
+    """Zero L1/L2 must reproduce plain gradient descent on sparse gradients."""
+    with self.test_session():
+      proximal = tf.train.ProximalGradientDescentOptimizer(
+          3.0, l1_regularization_strength=0.0,
+          l2_regularization_strength=0.0)
+      prox0, prox1 = self.applyOptimizer(proximal, is_sparse=True)
+
+    with self.test_session():
+      plain = tf.train.GradientDescentOptimizer(3.0)
+      base0, base1 = self.applyOptimizer(plain, is_sparse=True)
+
+    # Both variables must match between the two optimizers.
+    self.assertAllClose(prox0, base0)
+    self.assertAllClose(prox1, base1)
+
+  def testEquivGradientDescentwithoutRegularizaion(self):
+    """Zero L1/L2 must reproduce plain gradient descent on dense gradients."""
+    with self.test_session():
+      proximal = tf.train.ProximalGradientDescentOptimizer(
+          3.0, l1_regularization_strength=0.0,
+          l2_regularization_strength=0.0)
+      prox0, prox1 = self.applyOptimizer(proximal)
+
+    with self.test_session():
+      plain = tf.train.GradientDescentOptimizer(3.0)
+      base0, base1 = self.applyOptimizer(plain)
+
+    self.assertAllClose(prox0, base0)
+    self.assertAllClose(prox1, base1)
+
+
+if __name__ == "__main__":  # Entry point when this file is run as a script.
+  tf.test.main()
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index fecc0e0c00f..84ffd6c1dad 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -151,6 +151,7 @@ from tensorflow.python.ops import state_ops
 
 from tensorflow.python.training.adadelta import AdadeltaOptimizer
 from tensorflow.python.training.adagrad import AdagradOptimizer
+from tensorflow.python.training.proximal_adagrad import ProximalAdagradOptimizer
 from tensorflow.python.training.adam import AdamOptimizer
 from tensorflow.python.training.ftrl import FtrlOptimizer
 from tensorflow.python.training.momentum import MomentumOptimizer
@@ -158,6 +159,7 @@ from tensorflow.python.training.moving_averages import ExponentialMovingAverage
 from tensorflow.python.training.optimizer import Optimizer
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 from tensorflow.python.training.gradient_descent import GradientDescentOptimizer
+from tensorflow.python.training.proximal_gradient_descent import ProximalGradientDescentOptimizer
 from tensorflow.python.training.sync_replicas_optimizer import SyncReplicasOptimizer
 
 # Utility classes for training.
diff --git a/tensorflow/python/training/training_ops.py b/tensorflow/python/training/training_ops.py
index 46955e43c56..86197523387 100644
--- a/tensorflow/python/training/training_ops.py
+++ b/tensorflow/python/training/training_ops.py
@@ -69,6 +69,17 @@ def _ApplyAdagradShape(op):
   grad_shape = op.inputs[3].get_shape().merge_with(accum_shape)
   return [grad_shape]
 
+@ops.RegisterShape("ApplyProximalAdagrad")
+def _ApplyProximalAdagradShape(op):
+  """Returns the merged var/accum/grad shape for ApplyProximalAdagrad."""
+  var, accum, grad = op.inputs[0], op.inputs[1], op.inputs[5]
+  merged = accum.get_shape().merge_with(var.get_shape())
+  for scalar_index in (2, 3, 4):  # lr, l1, l2 must each be scalars.
+    _AssertInputIsScalar(op, scalar_index)
+  # The gradient must be compatible with the variable and accumulator shapes.
+  merged = grad.get_shape().merge_with(merged)
+  return [merged]
+
 
 @ops.RegisterShape("ApplyFtrl")
 def _ApplyFtrlShape(op):
@@ -133,6 +144,32 @@ def _ApplyGradientDescentShape(op):
   delta_shape = op.inputs[2].get_shape().merge_with(var_shape)
   return [delta_shape]
 
+
+@ops.RegisterShape("ApplyProximalGradientDescent")
+def _ApplyProximalGradientDescentShape(op):
+  """Returns the merged var/delta shape for ApplyProximalGradientDescent."""
+  var, delta = op.inputs[0], op.inputs[4]
+  for scalar_index in (1, 2, 3):  # alpha, l1, l2 must each be scalars.
+    _AssertInputIsScalar(op, scalar_index)
+  # The delta input must be compatible with the variable's shape.
+  merged = delta.get_shape().merge_with(var.get_shape())
+  return [merged]
+
+
+@ops.RegisterShape("SparseApplyProximalGradientDescent")
+def _SparseApplyProximalGradientDescentShape(op):
+  """Shape function for the SparseApplyProximalGradientDescent op."""
+  var_shape = op.inputs[0].get_shape()
+  for scalar_index in (1, 2, 3):  # lr, l1, l2 must each be scalars.
+    _AssertInputIsScalar(op, scalar_index)
+  # grad: any number of rows, each shaped like one row of var.
+  expected_grad = tensor_shape.TensorShape([None]).concatenate(var_shape[1:])
+  grad_shape = op.inputs[4].get_shape().merge_with(expected_grad)
+  # indices: a vector with one entry per row of grad (validated only).
+  op.inputs[5].get_shape().merge_with(tensor_shape.vector(grad_shape[0]))
+  return [var_shape]
+
+
 @ops.RegisterShape("SparseApplyAdadelta")
 def _SparseApplyAdadeltaShape(op):
    """Shape function for the SparseApplyAdadelta op."""
@@ -148,6 +185,7 @@ def _SparseApplyAdadeltaShape(op):
        tensor_shape.vector(grad_shape[0]))
    return [accum_update_shape]
 
+
 @ops.RegisterShape("SparseApplyAdagrad")
 def _SparseApplyAdagradShape(op):
   """Shape function for the SparseApplyAdagrad op."""
@@ -161,6 +199,21 @@ def _SparseApplyAdagradShape(op):
   return [accum_shape]
 
 
+@ops.RegisterShape("SparseApplyProximalAdagrad")
+def _SparseApplyProximalAdagradShape(op):
+  """Returns the merged var/accum shape for SparseApplyProximalAdagrad."""
+  var, accum = op.inputs[0], op.inputs[1]
+  accum_shape = accum.get_shape().merge_with(var.get_shape())
+  for scalar_index in (2, 3, 4):  # lr, l1, l2 must each be scalars.
+    _AssertInputIsScalar(op, scalar_index)
+  # grad: any number of rows, each shaped like one row of the accumulator.
+  expected_grad = tensor_shape.TensorShape([None]).concatenate(accum_shape[1:])
+  grad_shape = op.inputs[5].get_shape().merge_with(expected_grad)
+  # indices: a vector with one entry per row of grad (validated only).
+  op.inputs[6].get_shape().merge_with(tensor_shape.vector(grad_shape[0]))
+  return [accum_shape]
+
+
 @ops.RegisterShape("SparseApplyFtrl")
 def _SparseApplyFtrlShape(op):
   """Shape function for the SparseApplyFtrl op."""
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 23a8066e796..9d860e59a29 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -1876,6 +1876,40 @@ bool CudnnSupport::DoPoolForward(
   return true;
 }
 
+bool CudnnSupport::DoPoolForward(
+    Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
+    const dnn::BatchDescriptor& input_dimensions,
+    const DeviceMemory<Eigen::half>& input_data,
+    const dnn::BatchDescriptor& output_dimensions,
+    DeviceMemory<Eigen::half>* output_data) {
+  // Eigen::half overload: the tensors are described as CUDNN_DATA_HALF.
+  mutex_lock lock{dnn_handle_mutex_};
+  const auto set_rc = dynload::cudnnSetStream(
+      parent_, ToHandle(dnn_handle_), AsCUDAStreamValue(stream));
+  if (set_rc != CUDNN_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(set_rc);
+    return false;
+  }
+
+  // Scaling factors passed to cuDNN: alpha for the input, beta for the output.
+  float alpha = 1.0;
+  float beta = 0.0;
+
+  ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_HALF};
+  ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
+  ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
+  const auto pool_rc = dynload::cudnnPoolingForward(
+      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      src_desc.handle(), input_data.opaque(), &beta, dest_desc.handle(),
+      output_data->opaque());
+  if (pool_rc == CUDNN_STATUS_SUCCESS) {
+    return true;
+  }
+  LOG(ERROR) << "failed to enqueue forward pooling on stream: "
+             << ToString(pool_rc);
+  return false;
+}
+
 bool CudnnSupport::DoPoolBackward(
     Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
     const dnn::BatchDescriptor& input_dimensions,
@@ -1914,6 +1948,43 @@ bool CudnnSupport::DoPoolBackward(
   return true;
 }
 
+bool CudnnSupport::DoPoolBackward(
+    Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
+    const dnn::BatchDescriptor& input_dimensions,
+    const DeviceMemory<Eigen::half>& input_data,
+    const dnn::BatchDescriptor& output_dimensions,
+    const DeviceMemory<Eigen::half>& output_data,
+    const DeviceMemory<Eigen::half>& input_diff_data,
+    DeviceMemory<Eigen::half>* output_diff_data) {
+  // Eigen::half overload: the tensors are described as CUDNN_DATA_HALF.
+  mutex_lock lock{dnn_handle_mutex_};
+  const auto set_rc = dynload::cudnnSetStream(
+      parent_, ToHandle(dnn_handle_), AsCUDAStreamValue(stream));
+  if (set_rc != CUDNN_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(set_rc);
+    return false;
+  }
+
+  // Scaling factors passed to cuDNN: alpha for the input, beta for the output.
+  float alpha = 1.0;
+  float beta = 0.0;
+
+  ScopedTensorDescriptor src_desc{parent_, input_dimensions, CUDNN_DATA_HALF};
+  ScopedTensorDescriptor dest_desc{parent_, output_dimensions, CUDNN_DATA_HALF};
+  ScopedPoolingDescriptor pooling_desc{parent_, pooling_dimensions};
+  const auto pool_rc = dynload::cudnnPoolingBackward(
+      parent_, ToHandle(dnn_handle_), pooling_desc.handle(), &alpha,
+      dest_desc.handle(), output_data.opaque(), dest_desc.handle(),
+      input_diff_data.opaque(), src_desc.handle(), input_data.opaque(), &beta,
+      src_desc.handle(), output_diff_data->opaque());
+  if (pool_rc == CUDNN_STATUS_SUCCESS) {
+    return true;
+  }
+  LOG(ERROR) << "failed to enqueue backward pooling on stream: "
+             << ToString(pool_rc);
+  return false;
+}
+
 bool CudnnSupport::DoNormalize(
     Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
     const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 523a0c6c5d3..434ab730a78 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -201,6 +201,13 @@ class CudnnSupport : public dnn::DnnSupport {
                      const dnn::BatchDescriptor& output_dimensions,
                      DeviceMemory<float>* output_data) override;
 
+  bool DoPoolForward(Stream* stream,
+                     const dnn::PoolingDescriptor& pooling_dimensions,
+                     const dnn::BatchDescriptor& input_dimensions,
+                     const DeviceMemory<Eigen::half>& input_data,
+                     const dnn::BatchDescriptor& output_dimensions,
+                     DeviceMemory<Eigen::half>* output_data) override;
+
   bool DoPoolBackward(Stream* stream,
                       const dnn::PoolingDescriptor& pooling_dimensions,
                       const dnn::BatchDescriptor& input_dimensions,
@@ -210,6 +217,15 @@ class CudnnSupport : public dnn::DnnSupport {
                       const DeviceMemory<float>& input_diff_data,
                       DeviceMemory<float>* output_diff_data) override;
 
+  bool DoPoolBackward(Stream* stream,
+                      const dnn::PoolingDescriptor& pooling_dimensions,
+                      const dnn::BatchDescriptor& input_dimensions,
+                      const DeviceMemory<Eigen::half>& input_data,
+                      const dnn::BatchDescriptor& output_dimensions,
+                      const DeviceMemory<Eigen::half>& output_data,
+                      const DeviceMemory<Eigen::half>& input_diff_data,
+                      DeviceMemory<Eigen::half>* output_diff_data) override;
+
   bool DoNormalize(Stream* stream,
                    const dnn::NormalizeDescriptor& normalize_descriptor,
                    const DeviceMemory<float>& input_data,
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index fbb44dc7390..0ae482a73c4 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -1011,6 +1011,13 @@ class DnnSupport {
                              const dnn::BatchDescriptor& output_dimensions,
                              DeviceMemory<float>* output_data) = 0;
 
+  virtual bool DoPoolForward(Stream* stream,
+                             const dnn::PoolingDescriptor& pooling_dimensions,
+                             const dnn::BatchDescriptor& input_dimensions,
+                             const DeviceMemory<Eigen::half>& input_data,
+                             const dnn::BatchDescriptor& output_dimensions,
+                             DeviceMemory<Eigen::half>* output_data) = 0;
+
   // Performs differentiation of the pooling operation.
   virtual bool DoPoolBackward(Stream* stream,
                               const dnn::PoolingDescriptor& pooling_dimensions,
@@ -1021,6 +1028,15 @@ class DnnSupport {
                               const DeviceMemory<float>& input_diff_data,
                               DeviceMemory<float>* output_diff_data) = 0;
 
+  virtual bool DoPoolBackward(Stream* stream,
+                              const dnn::PoolingDescriptor& pooling_dimensions,
+                              const dnn::BatchDescriptor& input_dimensions,
+                              const DeviceMemory<Eigen::half>& input_data,
+                              const dnn::BatchDescriptor& output_dimensions,
+                              const DeviceMemory<Eigen::half>& output_data,
+                              const DeviceMemory<Eigen::half>& input_diff_data,
+                              DeviceMemory<Eigen::half>* output_diff_data) = 0;
+
   // Applies local response normalization to the values from
   // input_data and writes the result to output_data. See comments on
   // NormalizeDescriptor for a description of local response
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 446a3c9a7d1..be823d9500f 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -909,6 +909,30 @@ Stream &Stream::ThenPoolForward(
   return *this;
 }
 
+Stream &Stream::ThenPoolForward(
+    const dnn::PoolingDescriptor &pooling_dimensions,
+    const dnn::BatchDescriptor &input_dimensions,
+    const DeviceMemory<Eigen::half> &input_data,
+    const dnn::BatchDescriptor &output_dimensions,
+    DeviceMemory<Eigen::half> *output_data) {
+  VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions),
+            PARAM(input_data), PARAM(output_dimensions), PARAM(output_data));
+
+  // Eigen::half overload. The DNN call is skipped when ok() is false.
+  if (!ok()) {
+    return *this;
+  }
+  if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+    CheckError(dnn->DoPoolForward(this, pooling_dimensions, input_dimensions,
+                                  input_data, output_dimensions, output_data));
+  } else {
+    SetError();
+    LOG(WARNING) << "attempting to perform DNN operation using StreamExecutor "
+                    "without DNN support";
+  }
+  return *this;
+}
+
 Stream &Stream::ThenPoolBackward(
     const dnn::PoolingDescriptor &pooling_dimensions,
     const dnn::BatchDescriptor &input_dimensions,
@@ -936,6 +960,33 @@ Stream &Stream::ThenPoolBackward(
   return *this;
 }
 
+Stream &Stream::ThenPoolBackward(
+    const dnn::PoolingDescriptor &pooling_dimensions,
+    const dnn::BatchDescriptor &input_dimensions,
+    const DeviceMemory<Eigen::half> &input_data,
+    const dnn::BatchDescriptor &output_dimensions,
+    const DeviceMemory<Eigen::half> &output_data,
+    const DeviceMemory<Eigen::half> &input_diff_data,
+    DeviceMemory<Eigen::half> *output_diff_data) {
+  VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions),
+            PARAM(input_data), PARAM(output_dimensions), PARAM(output_data),
+            PARAM(input_diff_data), PARAM(output_diff_data));
+
+  if (!ok()) return *this;  // The DNN call is only attempted while ok().
+  dnn::DnnSupport *dnn_support = parent_->AsDnn();
+  if (dnn_support == nullptr) {
+    SetError();  // parent_->AsDnn() returned null.
+    LOG(WARNING)
+        << "attempting to perform DNN operation using StreamExecutor "
+           "without DNN support";
+    return *this;
+  }
+  CheckError(dnn_support->DoPoolBackward(
+      this, pooling_dimensions, input_dimensions, input_data,
+      output_dimensions, output_data, input_diff_data, output_diff_data));
+  return *this;
+}
+
 Stream &Stream::ThenNormalize(
     const dnn::NormalizeDescriptor &normalize_descriptor,
     const DeviceMemory<float> &input_data, DeviceMemory<float> *output_data) {
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index aac945c9e02..c131250de1e 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -421,6 +421,12 @@ class Stream {
                           const dnn::BatchDescriptor &output_dimensions,
                           DeviceMemory<float> *output_data);
 
+  Stream &ThenPoolForward(const dnn::PoolingDescriptor &pooling_dimensions,
+                          const dnn::BatchDescriptor &input_dimensions,
+                          const DeviceMemory<Eigen::half> &input_data,
+                          const dnn::BatchDescriptor &output_dimensions,
+                          DeviceMemory<Eigen::half> *output_data);
+
   Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
                            const dnn::BatchDescriptor &input_dimensions,
                            const DeviceMemory<float> &input_data,
@@ -429,6 +435,14 @@ class Stream {
                            const DeviceMemory<float> &input_diff_data,
                            DeviceMemory<float> *output_diff_data);
 
+  Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
+                           const dnn::BatchDescriptor &input_dimensions,
+                           const DeviceMemory<Eigen::half> &input_data,
+                           const dnn::BatchDescriptor &output_dimensions,
+                           const DeviceMemory<Eigen::half> &output_data,
+                           const DeviceMemory<Eigen::half> &input_diff_data,
+                           DeviceMemory<Eigen::half> *output_diff_data);
+
   Stream &ThenNormalize(const dnn::NormalizeDescriptor &normalize_descriptor,
                         const DeviceMemory<float> &input_data,
                         DeviceMemory<float> *output_data);
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 07f83651e02..d9cfb85fc36 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -6,8 +6,8 @@
 def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
     name = "eigen_archive",
-    url = "https://bitbucket.org/eigen/eigen/get/d02e6a705c30.tar.gz",
-    sha256 = "532956172daa8aba87c750791ff89a5c38cdb07e2525afe17ecb4bef812d67cf",
+    url = "https://bitbucket.org/eigen/eigen/get/0c0b79ecd74c.tar.gz",
+    sha256 = "b4b5884b03bd4bae114d02b36e2435ad1504ed8e51431d16c876b6f6a365882b",
     build_file = path_prefix + "eigen.BUILD",
   )
 
diff --git a/third_party/eigen3/Eigen/Cholesky b/third_party/eigen3/Eigen/Cholesky
index 56059bcc61c..7415ae4d0d5 100644
--- a/third_party/eigen3/Eigen/Cholesky
+++ b/third_party/eigen3/Eigen/Cholesky
@@ -1 +1 @@
-#include "eigen-eigen-d02e6a705c30/Eigen/Cholesky"
+#include "eigen-eigen-0c0b79ecd74c/Eigen/Cholesky"
diff --git a/third_party/eigen3/Eigen/Core b/third_party/eigen3/Eigen/Core
index c1d4a2e0f8c..787e1c076ea 100644
--- a/third_party/eigen3/Eigen/Core
+++ b/third_party/eigen3/Eigen/Core
@@ -1 +1 @@
-#include "eigen-eigen-d02e6a705c30/Eigen/Core"
+#include "eigen-eigen-0c0b79ecd74c/Eigen/Core"
diff --git a/third_party/eigen3/Eigen/Eigenvalues b/third_party/eigen3/Eigen/Eigenvalues
index 0a0731ba19b..b6e1b81eb5b 100644
--- a/third_party/eigen3/Eigen/Eigenvalues
+++ b/third_party/eigen3/Eigen/Eigenvalues
@@ -1 +1 @@
-#include "eigen-eigen-d02e6a705c30/Eigen/Eigenvalues"
+#include "eigen-eigen-0c0b79ecd74c/Eigen/Eigenvalues"
diff --git a/third_party/eigen3/Eigen/LU b/third_party/eigen3/Eigen/LU
index d6b39b8d235..a0782af0405 100644
--- a/third_party/eigen3/Eigen/LU
+++ b/third_party/eigen3/Eigen/LU
@@ -1 +1 @@
-#include "eigen-eigen-d02e6a705c30/Eigen/LU"
+#include "eigen-eigen-0c0b79ecd74c/Eigen/LU"
diff --git a/third_party/eigen3/Eigen/QR b/third_party/eigen3/Eigen/QR
index a5406e93bc6..0a9bee2898f 100644
--- a/third_party/eigen3/Eigen/QR
+++ b/third_party/eigen3/Eigen/QR
@@ -1 +1 @@
-#include "eigen-eigen-d02e6a705c30/Eigen/QR"
+#include "eigen-eigen-0c0b79ecd74c/Eigen/QR"
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
index 4f730236b78..5228bcda62e 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -1 +1 @@
-#include "eigen-eigen-d02e6a705c30/unsupported/Eigen/CXX11/Tensor"
+#include "eigen-eigen-0c0b79ecd74c/unsupported/Eigen/CXX11/Tensor"
diff --git a/tools/bazel.rc.template b/tools/bazel.rc.template
index d4dddb5211f..02856822c95 100644
--- a/tools/bazel.rc.template
+++ b/tools/bazel.rc.template
@@ -2,6 +2,7 @@ build:cuda --crosstool_top=//third_party/gpus/crosstool
 build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
 
 build --force_python=py$PYTHON_MAJOR_VERSION
+build --host_force_python=py$PYTHON_MAJOR_VERSION
 build --python$PYTHON_MAJOR_VERSION_path=$PYTHON_BINARY
 build --define=use_fast_cpp_protos=true
 build --define=allow_oversize_protos=true