diff --git a/README.md b/README.md
index e356ff931ce..2ea8aef02a4 100644
--- a/README.md
+++ b/README.md
@@ -33,9 +33,9 @@ and discussion.**
 
 People who are a little more adventurous can also try our nightly binaries:
 
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/140/artifact/pip_test/whl/tensorflow-0.8.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/))
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/))
 * Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
 * [Android](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/lastSuccessfulBuild/artifact/bazel-out/local_linux/bin/tensorflow/examples/android/tensorflow_demo.apk) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/))
 
diff --git a/configure b/configure
index f4b772c55ef..ce75bb490a7 100755
--- a/configure
+++ b/configure
@@ -2,6 +2,11 @@
 
 DO_NOT_SUBMIT_WARNING="Unofficial setting. DO NOT SUBMIT!!!"
 
+# Find out the absolute, symlink-resolved path to where ./configure resides
+pushd "$(dirname "$0")" > /dev/null
+SOURCE_BASE_DIR="$(pwd -P)"
+popd > /dev/null
+
 ## Set up python-related environment settings
 while true; do
   fromuser=""
@@ -68,6 +73,12 @@ echo "$SWIG_PATH" > tensorflow/tools/swig/swig_path
 # Invoke python_config and set up symlinks to python includes
 (./util/python/python_config.sh --setup "$PYTHON_BIN_PATH";) || exit -1
 
+# Run gen_git_source to create links that let bazel track dependencies for
+# git hash propagation. Paths are quoted so directories with spaces work.
+GEN_GIT_SOURCE=tensorflow/tools/git/gen_git_source.py
+chmod a+x "${GEN_GIT_SOURCE}"
+"${PYTHON_BIN_PATH}" "${GEN_GIT_SOURCE}" --configure "${SOURCE_BASE_DIR}"
+
 ## Set up Cuda-related environment settings
 
 while [ "$TF_NEED_CUDA" == "" ]; do
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index e8391c0f70a..998ed0b12c3 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -176,6 +176,7 @@ filegroup(
         "//tensorflow/tools/docker:all_files",
         "//tensorflow/tools/docker/notebooks:all_files",
         "//tensorflow/tools/docs:all_files",
+        "//tensorflow/tools/git:all_files",
         "//tensorflow/tools/proto_text:all_files",
         "//tensorflow/tools/test:all_files",
         "//tensorflow/user_ops:all_files",
diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index 37ce250843c..84bae2f768a 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -89,6 +89,18 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "stochastic_gradient_estimators_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/stochastic_gradient_estimators_test.py"],
+    additional_deps = [
+        ":bayesflow_py",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "reinforce_simple_example",
     size = "small",
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_gradient_estimators_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_gradient_estimators_test.py
new file mode 100644
index 00000000000..56936e6c38c
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_gradient_estimators_test.py
@@ -0,0 +1,103 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for stochastic gradient estimators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+st = tf.contrib.bayesflow.stochastic_tensor
+sge = tf.contrib.bayesflow.stochastic_gradient_estimators
+
+
+class StochasticGradientEstimatorsTest(tf.test.TestCase):
+  """Checks the losses built by the `sge` score function estimators."""
+
+  def setUp(self):
+    # p is very close to 1. NOTE(review): presumably so that the Bernoulli
+    # sample is 1 with near certainty and its log-probability is log(p), which
+    # is what every expected value below assumes -- confirm.
+    self._p = tf.constant(0.999999)
+    self._final_loss = tf.constant(3.2)
+
+  def _testScoreFunction(self, loss_fn, expected):
+    """Asserts that a BernoulliTensor built with `loss_fn` has loss `expected`.
+
+    Args:
+      loss_fn: loss function handed to `st.BernoulliTensor`.
+      expected: `Tensor` whose value the resulting loss must be close to.
+    """
+    x = st.BernoulliTensor(p=self._p, loss_fn=loss_fn)
+    sf = x.loss(self._final_loss)
+    with self.test_session() as sess:
+      sess.run(tf.initialize_all_variables())
+      self.assertAllClose(*sess.run([expected, sf]))
+
+  def testScoreFunction(self):
+    expected = tf.log(self._p) * self._final_loss
+    self._testScoreFunction(sge.score_function, expected)
+
+  def testScoreFunctionWithConstantBaseline(self):
+    b = tf.constant(9.8)
+    expected = tf.log(self._p) * (self._final_loss - b)
+    self._testScoreFunction(
+        sge.get_score_function_with_constant_baseline(b), expected)
+
+  def testScoreFunctionWithBaselineFn(self):
+    b = tf.constant(9.8)
+
+    def baseline_fn(stoch_tensor, loss):
+      self.assertIsInstance(stoch_tensor, st.StochasticTensor)
+      self.assertIsInstance(loss, tf.Tensor)
+      return b
+
+    expected = tf.log(self._p) * (self._final_loss - b)
+    self._testScoreFunction(
+        sge.get_score_function_with_baseline(baseline_fn), expected)
+
+  def testScoreFunctionWithMeanBaseline(self):
+    ema_decay = 0.8
+    x = st.BernoulliTensor(
+        p=self._p,
+        loss_fn=sge.get_score_function_with_baseline(
+            sge.get_mean_baseline(ema_decay)))
+    sf = x.loss(self._final_loss)
+
+    # The baseline is expected to be (1 - ema_decay) * final_loss.
+    expected = tf.log(self._p) * (self._final_loss -
+                                  (1. - ema_decay) * self._final_loss)
+
+    with self.test_session() as sess:
+      sess.run(tf.initialize_all_variables())
+      sess.run(sf)  # run to update EMA
+      self.assertAllClose(*sess.run([expected, sf]))
+
+  def testScoreFunctionWithAdvantageFn(self):
+    b = tf.constant(9.8)
+
+    def advantage_fn(stoch_tensor, loss):
+      self.assertIsInstance(stoch_tensor, st.StochasticTensor)
+      self.assertIsInstance(loss, tf.Tensor)
+      return loss - b
+
+    expected = tf.log(self._p) * (self._final_loss - b)
+    self._testScoreFunction(
+        sge.get_score_function_with_advantage(advantage_fn), expected)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/contrib/bayesflow/python/ops/stochastic_gradient_estimators.py b/tensorflow/contrib/bayesflow/python/ops/stochastic_gradient_estimators.py
index 62b8446131c..7cb8ef06f93 100644
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_gradient_estimators.py
+++ b/tensorflow/contrib/bayesflow/python/ops/stochastic_gradient_estimators.py
@@ -58,7 +58,6 @@ from __future__ import print_function
 
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import training
 from tensorflow.python.util.all_util import make_all
@@ -180,11 +179,12 @@ def get_mean_baseline(ema_decay=0.99, name="MeanBaseline"):
   def mean_baseline(_, loss):
     with ops.name_scope(name):
       ema = training.ExponentialMovingAverage(decay=ema_decay)
-      update_op = ema.apply(math_ops.reduce_mean(loss))
-      with control_flow_ops.control_dependencies([update_op]):
+      reduced_loss = math_ops.reduce_mean(loss)
+      update_op = ema.apply([reduced_loss])
+      with ops.control_dependencies([update_op]):
         # TODO(rsepassi): Possibly implement the initialization bias correction
         # term from Adam (section 3 of https://arxiv.org/pdf/1412.6980v8.pdf).
-        baseline = ema.average(loss)
+        baseline = ema.average(reduced_loss)
       return baseline
 
   return mean_baseline
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index 3965f843635..89ad766e112 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -177,6 +177,21 @@ add_dependencies(tf_core_lib
     boringssl
 )
 
+# Force version_info.cc to be regenerated on every build.
+# ${VERSION_INFO_CC} depends on __force_rebuild, an output that is declared
+# but never actually created, so the generating command is always considered
+# out of date and gen_git_source.py is re-run each time.
+set(VERSION_INFO_CC ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
+add_custom_target(force_rebuild_target ALL DEPENDS ${VERSION_INFO_CC})
+add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
+add_custom_command(OUTPUT
+    ${VERSION_INFO_CC}
+    COMMAND ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
+    --raw_generate ${VERSION_INFO_CC}
+    DEPENDS __force_rebuild)
+
+set(tf_version_srcs ${VERSION_INFO_CC})
+
 
 ########################################################
 # tf_core_framework library
@@ -208,6 +223,7 @@ list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_test_srcs})
 
 add_library(tf_core_framework OBJECT
     ${tf_core_framework_srcs}
+    ${tf_version_srcs}
     ${PROTO_TEXT_HDRS}
     ${PROTO_TEXT_SRCS})
 target_include_directories(tf_core_framework PUBLIC
diff --git a/tensorflow/contrib/learn/__init__.py b/tensorflow/contrib/learn/__init__.py
index 84c5a356710..6287cab9b2b 100644
--- a/tensorflow/contrib/learn/__init__.py
+++ b/tensorflow/contrib/learn/__init__.py
@@ -25,8 +25,6 @@ Train and evaluate TensorFlow models.
 @@ModeKeys
 @@DNNClassifier
 @@DNNRegressor
-@@TensorFlowDNNClassifier
-@@TensorFlowDNNRegressor
 @@TensorFlowEstimator
 @@LinearClassifier
 @@LinearRegressor
diff --git a/tensorflow/contrib/learn/python/learn/estimators/__init__.py b/tensorflow/contrib/learn/python/learn/estimators/__init__.py
index 384f2ba53b2..3b272f017ed 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/__init__.py
@@ -26,8 +26,6 @@ from tensorflow.contrib.learn.python.learn.estimators.base import TensorFlowEsti
 from tensorflow.contrib.learn.python.learn.estimators.classifier import Classifier
 from tensorflow.contrib.learn.python.learn.estimators.dnn import DNNClassifier
 from tensorflow.contrib.learn.python.learn.estimators.dnn import DNNRegressor
-from tensorflow.contrib.learn.python.learn.estimators.dnn import TensorFlowDNNClassifier
-from tensorflow.contrib.learn.python.learn.estimators.dnn import TensorFlowDNNRegressor
 from tensorflow.contrib.learn.python.learn.estimators.dnn_linear_combined import DNNLinearCombinedClassifier
 from tensorflow.contrib.learn.python.learn.estimators.dnn_linear_combined import DNNLinearCombinedRegressor
 from tensorflow.contrib.learn.python.learn.estimators.estimator import BaseEstimator
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index ca2065b278a..16f0ab7056a 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -19,9 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.learn.python.learn.estimators import _sklearn
 from tensorflow.contrib.learn.python.learn.estimators import dnn_linear_combined
-from tensorflow.contrib.learn.python.learn.estimators.base import DeprecatedMixin
 from tensorflow.python.ops import nn
 
 
@@ -279,14 +277,3 @@ class DNNRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
   @property
   def bias_(self):
     return self.dnn_bias_
-
-
-# TensorFlowDNNClassifier and TensorFlowDNNRegressor are deprecated.
-class TensorFlowDNNClassifier(DeprecatedMixin, DNNClassifier,
-                              _sklearn.ClassifierMixin):
-  pass
-
-
-class TensorFlowDNNRegressor(DeprecatedMixin, DNNRegressor,
-                             _sklearn.RegressorMixin):
-  pass
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index d33346b4f64..e0d1837e584 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -166,10 +166,16 @@ def _get_replica_device_setter(config):
   """
   ps_ops = ['Variable', 'AutoReloadVariable',
             'MutableHashTable', 'MutableHashTableOfTensors']
+
+  if config.job_name:
+    worker_device = '/job:%s/task:%d' % (config.job_name, config.task)
+  else:
+    worker_device = '/job:worker'
+
   if config.num_ps_replicas > 0:
     return device_setter.replica_device_setter(
-        ps_tasks=config.num_ps_replicas, merge_devices=False, ps_ops=ps_ops,
-        cluster=config.cluster_spec)
+        ps_tasks=config.num_ps_replicas, worker_device=worker_device,
+        merge_devices=False, ps_ops=ps_ops, cluster=config.cluster_spec)
   else:
     return None
 
@@ -653,7 +659,7 @@ class BaseEstimator(
           if not isinstance(m, session_run_hook.SessionRunHook)
       ]
 
-      supervisor_is_chief = (self._config.task == 0)
+      supervisor_is_chief = self._config.is_chief
       if not supervisor_is_chief:
         # Prune list of monitor to the ones runnable on all workers.
         deprecated_monitors = [m for m in deprecated_monitors
@@ -746,7 +752,7 @@ class BaseEstimator(
           eval_dict=eval_dict,
           update_op=update_op,
           global_step_tensor=global_step,
-          supervisor_master=self._config.master,
+          supervisor_master=self._config.evaluation_master,
           feed_fn=feed_fn,
           max_steps=steps)
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index 3ed5d9318af..c00fc213129 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -503,6 +503,20 @@ class ReplicaDeviceSetterTest(tf.test.TestCase):
     self.assertDeviceEqual('', table._table_ref.device)
     self.assertDeviceEqual('', output.device)
 
+  def testTaskIsSetOnWorkerWhenJobNameIsSet(self):
+    """Ops go to the configured worker task; variables stay on the ps task."""
+    config = tf.contrib.learn.RunConfig(
+        num_ps_replicas=1, job_name='worker', task=3)
+    with tf.device(estimator._get_replica_device_setter(config)):
+      v = tf.Variable([1, 2])
+      w = tf.Variable([2, 1])
+      a = v + w
+    # Both variables, and their initializers, are expected on the one ps task.
+    for variable in (v, w):
+      self.assertDeviceEqual('/job:ps/task:0', variable.device)
+      self.assertDeviceEqual('/job:ps/task:0', variable.initializer.device)
+    self.assertDeviceEqual('/job:worker/task:3', a.device)
+
 
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index ab48e3beea8..4be50b05e14 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -49,7 +49,8 @@ class RunConfig(object):
                keep_checkpoint_max=5,
                keep_checkpoint_every_n_hours=10000,
                job_name=None,
-               is_chief=None):
+               is_chief=None,
+               evaluation_master=''):
     """Constructor.
 
     If set to None, `master`, `task`, `num_ps_replicas`, `cluster_spec`,
@@ -115,6 +116,7 @@ class RunConfig(object):
         must exist in the `cluster_spec.jobs`.
       is_chief: whether or not this task (as identified by the other parameters)
         should be the chief task.
+      evaluation_master: the master on which to perform evaluation.
 
     Raises:
       ValueError: if num_ps_replicas and cluster_spec are set (cluster_spec
@@ -145,10 +147,16 @@ class RunConfig(object):
 
     # Set is_chief.
     self._is_chief = is_chief
-    # When the TF_CONFIG environment variable is set, we can set the default
-    # of is_chief to 0 when job_name is "master" and task is 0.
-    if (self._is_chief is None) and config:
-      self._is_chief = (self._job_name == 'master' and self.task == 0)
+    if self._is_chief is None:
+      if not self._job_name:
+        self._is_chief = (self.task == 0)
+      elif config:
+        # When the TF_CONFIG environment variable is set, is_chief defaults
+        # to True only when job_name is "master" and task is 0.
+        self._is_chief = (self._job_name == 'master' and self.task == 0)
+      else:
+        # Legacy behavior: is_chief is True only for task 0 of job "worker".
+        self._is_chief = (self._job_name == 'worker' and self.task == 0)
 
     # Enforce that is_chief is only applicable to workers or masters
     # (Cloud ML) with task == 0.
@@ -169,6 +177,8 @@ class RunConfig(object):
           'Master task 0 must be chief. Please check is_chief, job_name, and '
           'task, which may have been set in TF_CONFIG environment variable.')
 
+    self.evaluation_master = evaluation_master or ''
+
     gpu_options = GPUOptions(
         per_process_gpu_memory_fraction=gpu_memory_fraction)
     self.tf_config = ConfigProto(
@@ -224,7 +234,7 @@ def _get_master(cluster_spec, job_name, task_index):
           '%s\n\n'
           'Note that these value may be coming from the TF_CONFIG environment '
           'variable.' % (task_index, job_name, cluster_spec))
-    return addresses[task_index]
+    return 'grpc://' + addresses[task_index]
 
   # For backwards compatibility, we return empty string if job_name was
   # not set (job_name did not previously exist).
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index a3d851b8a42..3a612aec302 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -115,14 +115,18 @@ class Experiment(object):
       The trained estimator.
     """
     start = time.time()
+
+    # Start the server, if needed. It's important to start the server before
+    # we (optionally) sleep for the case where no device_filters are set.
+    # Otherwise, the servers will wait to connect to each other before starting
+    # to train. We might as well start as soon as we can.
+    if self._estimator.config.cluster_spec:
+      self._start_server()
+
     if delay_secs is None:
       task_id = self._estimator.config.task or 0
       delay_secs = min(60, task_id * 5)
 
-    # Start the server, if needed.
-    if self._estimator.config.cluster_spec:
-      self._start_server()
-
     if delay_secs:
       elapsed_secs = time.time() - start
       remaining = delay_secs - elapsed_secs
diff --git a/tensorflow/contrib/learn/python/learn/learn_runner.py b/tensorflow/contrib/learn/python/learn/learn_runner.py
index f9fc9838fa2..b473b99c8f6 100644
--- a/tensorflow/contrib/learn/python/learn/learn_runner.py
+++ b/tensorflow/contrib/learn/python/learn/learn_runner.py
@@ -117,7 +117,7 @@ def _get_default_schedule(config):
     # than one masters or explicitly disallow.
     return 'local_run'
   elif config.job_name == 'ps':
-    return 'serve'
+    return 'run_std_server'
   elif config.job_name == 'worker':
     return 'train'
 
diff --git a/tensorflow/contrib/learn/python/learn/tests/early_stopping_test.py b/tensorflow/contrib/learn/python/learn/tests/early_stopping_test.py
index 2f46e7cbcdf..ba9ab5277f4 100644
--- a/tensorflow/contrib/learn/python/learn/tests/early_stopping_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/early_stopping_test.py
@@ -47,26 +47,29 @@ class EarlyStoppingTest(tf.test.TestCase):
         x_train, y_train, test_size=0.2, random_state=42)
     val_monitor = learn.monitors.ValidationMonitor(
         x_val, y_val, every_n_steps=50, early_stopping_rounds=100,
-        early_stopping_metric='accuracy', early_stopping_metric_minimize=False)
+        early_stopping_metric='loss', early_stopping_metric_minimize=False)
+
+    feature_columns = learn.infer_real_valued_columns_from_input(iris.data)
 
     # classifier without early stopping - overfitting
-    classifier1 = learn.TensorFlowDNNClassifier(
-        hidden_units=[10, 20, 10], n_classes=3, steps=1000)
-    classifier1.fit(x_train, y_train)
+    classifier1 = learn.DNNClassifier(
+        feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)
+    classifier1.fit(x_train, y_train, steps=1000)
     _ = accuracy_score(y_test, classifier1.predict(x_test))
 
-    # Full 1000 steps, 12 summaries and no evaluation summary.
-    # 12 summaries = global_step + first + every 100 out of 1000 steps.
-    self.assertEqual(12, len(_get_summary_events(classifier1.model_dir)))
+    # Full 1000 steps, 19 summaries and no evaluation summary:
+    # 1 summary of net at step 1
+    # 9 x (1 summary of net and 1 summary of global step) for steps 101, 201,...
+    self.assertEqual(19, len(_get_summary_events(classifier1.model_dir)))
     with self.assertRaises(ValueError):
       _get_summary_events(classifier1.model_dir + '/eval')
 
     # classifier with early stopping - improved accuracy on testing set
-    classifier2 = learn.TensorFlowDNNClassifier(
-        hidden_units=[10, 20, 10], n_classes=3, steps=2000,
+    classifier2 = learn.DNNClassifier(
+        hidden_units=[10, 20, 10], feature_columns=feature_columns, n_classes=3,
         config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
 
-    classifier2.fit(x_train, y_train, monitors=[val_monitor])
+    classifier2.fit(x_train, y_train, monitors=[val_monitor], steps=2000)
     _ = accuracy_score(y_val, classifier2.predict(x_val))
     _ = accuracy_score(y_test, classifier2.predict(x_test))
 
diff --git a/tensorflow/contrib/learn/python/learn/tests/estimators_test.py b/tensorflow/contrib/learn/python/learn/tests/estimators_test.py
index 73cb96bdfb9..3231e43becb 100644
--- a/tensorflow/contrib/learn/python/learn/tests/estimators_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/estimators_test.py
@@ -41,19 +41,18 @@ class InferredfeatureColumnTest(tf.test.TestCase):
                                                         test_size=0.2,
                                                         random_state=42)
 
-    def custom_optimizer(learning_rate):
-      return tf.train.MomentumOptimizer(learning_rate, 0.9)
+    def custom_optimizer():
+      return tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
 
     cont_features = [
         tf.contrib.layers.real_valued_column("", dimension=4)]
-    classifier = learn.TensorFlowDNNClassifier(
+    classifier = learn.DNNClassifier(
         feature_columns=cont_features,
         hidden_units=[10, 20, 10],
         n_classes=3,
-        steps=400,
-        learning_rate=0.01,
-        optimizer=custom_optimizer)
-    classifier.fit(x_train, y_train)
+        optimizer=custom_optimizer,
+        config=learn.RunConfig(tf_random_seed=1))
+    classifier.fit(x_train, y_train, steps=400)
     score = accuracy_score(y_test, classifier.predict(x_test))
 
     self.assertGreater(score, 0.65, "Failed with score = {0}".format(score))
@@ -71,17 +70,16 @@ class CustomOptimizer(tf.test.TestCase):
                                                         test_size=0.2,
                                                         random_state=42)
 
-    def custom_optimizer(learning_rate):
-      return tf.train.MomentumOptimizer(learning_rate, 0.9)
+    def custom_optimizer():
+      return tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
 
-    classifier = learn.TensorFlowDNNClassifier(
+    classifier = learn.DNNClassifier(
         hidden_units=[10, 20, 10],
         feature_columns=learn.infer_real_valued_columns_from_input(x_train),
         n_classes=3,
-        steps=400,
-        learning_rate=0.01,
-        optimizer=custom_optimizer)
-    classifier.fit(x_train, y_train)
+        optimizer=custom_optimizer,
+        config=learn.RunConfig(tf_random_seed=1))
+    classifier.fit(x_train, y_train, steps=400)
     score = accuracy_score(y_test, classifier.predict(x_test))
 
     self.assertGreater(score, 0.65, "Failed with score = {0}".format(score))
diff --git a/tensorflow/contrib/learn/python/learn/tests/grid_search_test.py b/tensorflow/contrib/learn/python/learn/tests/grid_search_test.py
index e88de568892..419fd9c7538 100644
--- a/tensorflow/contrib/learn/python/learn/tests/grid_search_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/grid_search_test.py
@@ -43,12 +43,14 @@ class GridSearchTest(tf.test.TestCase):
     if HAS_SKLEARN:
       random.seed(42)
       iris = datasets.load_iris()
-      classifier = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
-                                                 n_classes=3,
-                                                 steps=50)
+      feature_columns = learn.infer_real_valued_columns_from_input(iris.data)
+      classifier = learn.DNNClassifier(
+          feature_columns=feature_columns, hidden_units=[10, 20, 10],
+          n_classes=3)
       grid_search = GridSearchCV(classifier,
-                                 {'hidden_units': [[5, 5], [10, 10]],
-                                  'learning_rate': [0.1, 0.01]})
+                                 {'hidden_units': [[5, 5], [10, 10]]},
+                                 scoring='accuracy',
+                                 fit_params={'steps': [50]})
       grid_search.fit(iris.data, iris.target)
       score = accuracy_score(iris.target, grid_search.predict(iris.data))
       self.assertGreater(score, 0.5, 'Failed with score = {0}'.format(score))
diff --git a/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py b/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py
index a916fe54acb..e7a16b002ff 100644
--- a/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/learn_runner_test.py
@@ -44,8 +44,8 @@ class TestExperiment(tf.contrib.learn.Experiment):
   def train(self):
     return "train"
 
-  def serve(self):
-    return "serve"
+  def run_std_server(self):
+    return "run_std_server"
 
   def simple_task(self):
     return "simple_task, default=%s." % self.default
@@ -92,7 +92,7 @@ class MainTest(tf.test.TestCase):
                          output_dir="/tmp"))
 
   def test_schedule_from_config_runs_train_and_evaluate_on_master(self):
-    config = run_config.RunConfig(job_name="master")
+    config = run_config.RunConfig(job_name="master", task=0, is_chief=True)
     self.assertEqual(
         "train_and_evaluate",
         learn_runner.run(lambda output_dir: TestExperiment(config=config),
@@ -101,7 +101,7 @@ class MainTest(tf.test.TestCase):
   def test_schedule_from_config_runs_serve_on_ps(self):
     config = run_config.RunConfig(job_name="ps")
     self.assertEqual(
-        "serve",
+        "run_std_server",
         learn_runner.run(lambda output_dir: TestExperiment(config=config),
                          output_dir="/tmp"))
 
diff --git a/tensorflow/contrib/learn/python/learn/tests/nonlinear_test.py b/tensorflow/contrib/learn/python/learn/tests/nonlinear_test.py
index 515f7985e31..a36e6b3dbcd 100644
--- a/tensorflow/contrib/learn/python/learn/tests/nonlinear_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/nonlinear_test.py
@@ -37,9 +37,10 @@ class NonLinearTest(tf.test.TestCase):
   def testIrisDNN(self):
     iris = tf.contrib.learn.datasets.load_iris()
     feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
-    classifier = tf.contrib.learn.TensorFlowDNNClassifier(
-        feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)
-    classifier.fit(iris.data, iris.target)
+    classifier = tf.contrib.learn.DNNClassifier(
+        feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3,
+        config=tf.contrib.learn.RunConfig(tf_random_seed=1))
+    classifier.fit(iris.data, iris.target, max_steps=200)
     score = accuracy_score(iris.target, classifier.predict(iris.data))
     self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
     weights = classifier.weights_
@@ -53,10 +54,11 @@ class NonLinearTest(tf.test.TestCase):
   def testBostonDNN(self):
     boston = tf.contrib.learn.datasets.load_boston()
     feature_columns = [tf.contrib.layers.real_valued_column("", dimension=13)]
-    regressor = tf.contrib.learn.TensorFlowDNNRegressor(
-        feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=0,
-        batch_size=boston.data.shape[0], steps=300, learning_rate=0.01)
-    regressor.fit(boston.data, boston.target)
+    regressor = tf.contrib.learn.DNNRegressor(
+        feature_columns=feature_columns, hidden_units=[10, 20, 10],
+        config=tf.contrib.learn.RunConfig(tf_random_seed=1))
+    regressor.fit(
+        boston.data, boston.target, steps=300, batch_size=boston.data.shape[0])
     score = mean_squared_error(boston.target, regressor.predict(boston.data))
     self.assertLess(score, 110, "Failed with score = {0}".format(score))
     weights = regressor.weights_
@@ -71,10 +73,10 @@ class NonLinearTest(tf.test.TestCase):
     # Dropout prob == 0.
     iris = tf.contrib.learn.datasets.load_iris()
     feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
-    classifier = tf.contrib.learn.TensorFlowDNNClassifier(
+    classifier = tf.contrib.learn.DNNClassifier(
         feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3,
-        dropout=0.0)
-    classifier.fit(iris.data, iris.target)
+        dropout=0.0, config=tf.contrib.learn.RunConfig(tf_random_seed=1))
+    classifier.fit(iris.data, iris.target, max_steps=200)
     score = accuracy_score(iris.target, classifier.predict(iris.data))
     self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
 
@@ -82,10 +84,10 @@ class NonLinearTest(tf.test.TestCase):
     # Dropping only a little.
     iris = tf.contrib.learn.datasets.load_iris()
     feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
-    classifier = tf.contrib.learn.TensorFlowDNNClassifier(
+    classifier = tf.contrib.learn.DNNClassifier(
         feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3,
-        dropout=0.1)
-    classifier.fit(iris.data, iris.target)
+        dropout=0.1, config=tf.contrib.learn.RunConfig(tf_random_seed=1))
+    classifier.fit(iris.data, iris.target, max_steps=200)
     score = accuracy_score(iris.target, classifier.predict(iris.data))
     # If the quality is lower - dropout is not working.
     self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
@@ -94,10 +96,10 @@ class NonLinearTest(tf.test.TestCase):
     # Dropping out most of it.
     iris = tf.contrib.learn.datasets.load_iris()
     feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
-    classifier = tf.contrib.learn.TensorFlowDNNClassifier(
+    classifier = tf.contrib.learn.DNNClassifier(
         feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3,
-        dropout=0.9)
-    classifier.fit(iris.data, iris.target)
+        dropout=0.9, config=tf.contrib.learn.RunConfig(tf_random_seed=1))
+    classifier.fit(iris.data, iris.target, max_steps=200)
     score = accuracy_score(iris.target, classifier.predict(iris.data))
     self.assertGreater(score, 0.3, "Failed with score = {0}".format(score))
     # If the quality is higher - dropout is not working.
diff --git a/tensorflow/contrib/learn/python/learn/tests/run_config_test.py b/tensorflow/contrib/learn/python/learn/tests/run_config_test.py
index d4212316673..4164b450452 100644
--- a/tensorflow/contrib/learn/python/learn/tests/run_config_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/run_config_test.py
@@ -35,7 +35,8 @@ class RunConfigTest(tf.test.TestCase):
     self.assertEquals(config.num_ps_replicas, 0)
     self.assertIsNone(config.cluster_spec)
     self.assertIsNone(config.job_name)
-    self.assertIsNone(config.is_chief)
+    self.assertTrue(config.is_chief)
+    self.assertEquals(config.evaluation_master, "")
 
   def test_values_from_tf_config(self):
     tf_config = {"cluster": {"ps": ["host1:1", "host2:2"],
@@ -45,12 +46,13 @@ class RunConfigTest(tf.test.TestCase):
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
       config = run_config.RunConfig()
 
-    self.assertEquals(config.master, "host4:4")
+    self.assertEquals(config.master, "grpc://host4:4")
     self.assertEquals(config.task, 1)
     self.assertEquals(config.num_ps_replicas, 2)
     self.assertEquals(config.cluster_spec.as_dict(), tf_config["cluster"])
     self.assertEquals(config.job_name, "worker")
     self.assertFalse(config.is_chief)
+    self.assertEquals(config.evaluation_master, "")
 
   def test_explicitly_specified_values(self):
     cluster_spec = tf.train.ClusterSpec({
@@ -61,7 +63,9 @@ class RunConfigTest(tf.test.TestCase):
         master="localhost:0",
         task=2,
         job_name="my_job_name",
-        cluster_spec=cluster_spec,)
+        cluster_spec=cluster_spec,
+        evaluation_master="localhost:9991"
+    )
 
     self.assertEquals(config.master, "localhost:0")
     self.assertEquals(config.task, 2)
@@ -69,6 +73,7 @@ class RunConfigTest(tf.test.TestCase):
     self.assertEquals(config.cluster_spec, cluster_spec)
     self.assertEquals(config.job_name, "my_job_name")
     self.assertFalse(config.is_chief)
+    self.assertEquals(config.evaluation_master, "localhost:9991")
 
   def test_tf_config_with_overrides(self):
     # Purpose: to test the case where TF_CONFIG is set, but then
@@ -198,6 +203,39 @@ class RunConfigTest(tf.test.TestCase):
 
     self.assertTrue(config.is_chief)
 
+  def test_default_is_chief_from_tf_config_without_job_name(self):
+    tf_config = {"cluster": {},
+                 "task": {}}
+    with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
+      config = run_config.RunConfig()
+
+    self.assertTrue(config.is_chief)
+
+  def test_default_is_chief_without_tf_config_or_job_name(self):
+    # When is_chief is omitted and neither TF_CONFIG nor job_name is set
+    # (legacy behavior), is_chief should be True iff task == 0.
+    config = run_config.RunConfig(task=0)
+    self.assertTrue(config.is_chief)
+
+    config = run_config.RunConfig(task=1)
+    self.assertFalse(config.is_chief)
+
+  def test_default_is_chief_without_tf_config_but_has_job_name(self):
+    # When is_chief is omitted and there is no TF_CONFIG but a job_name is
+    # given, is_chief is True iff job_name is "worker" and task == 0.
+    config = run_config.RunConfig(job_name="worker", task=0)
+    self.assertTrue(config.is_chief)
+
+    config = run_config.RunConfig(
+        job_name="worker", task=1)
+    self.assertFalse(config.is_chief)
+
+    config = run_config.RunConfig(job_name="ps", task=0)
+    self.assertFalse(config.is_chief)
+
+    config = run_config.RunConfig(job_name="ps", task=1)
+    self.assertFalse(config.is_chief)
+
   def test_bad_is_chief_combinations_raise(self):
     msg = "Task is 1, but only task 0 may be chief"
     with self.assertRaisesRegexp(ValueError, msg):
diff --git a/tensorflow/contrib/learn/python/learn/tests/saver_test.py b/tensorflow/contrib/learn/python/learn/tests/saver_test.py
index 2c939712bbd..838a3ff33b2 100644
--- a/tensorflow/contrib/learn/python/learn/tests/saver_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/saver_test.py
@@ -62,17 +62,17 @@ class SaverTest(tf.test.TestCase):
     # self.assertGreater(score, 0.5, 'Failed with score = {0}'.format(score))
 
   def testDNN(self):
-    path = tf.test.get_temp_dir() + '/tmp_saver3'
     random.seed(42)
     iris = datasets.load_iris()
     cont_features = [
         tf.contrib.layers.real_valued_column('', dimension=4)]
-    classifier = learn.TensorFlowDNNClassifier(feature_columns=cont_features,
-                                               hidden_units=[10, 20, 10],
-                                               n_classes=3)
-    classifier.fit(iris.data, iris.target)
-    classifier.save(path)
+    classifier = learn.DNNClassifier(feature_columns=cont_features,
+                                     hidden_units=[10, 20, 10],
+                                     n_classes=3)
+    classifier.fit(iris.data, iris.target, max_steps=100)
     # TODO(ipolosukhin): Remove or restore.
+    # path = tf.test.get_temp_dir() + '/tmp_saver3'
+    # classifier.save(path)
     # new_classifier = learn.TensorFlowEstimator.restore(path)
     # self.assertEqual(type(new_classifier), type(classifier))
     # score = accuracy_score(iris.target, new_classifier.predict(iris.data))
@@ -83,17 +83,17 @@ class SaverTest(tf.test.TestCase):
       learn.TensorFlowEstimator.restore('no_model_path')
 
   def testNoCheckpoints(self):
-    path = tf.test.get_temp_dir() + '/tmp/tmp.saver4'
     random.seed(42)
     iris = datasets.load_iris()
     cont_features = [
         tf.contrib.layers.real_valued_column('', dimension=4)]
-    classifier = learn.TensorFlowDNNClassifier(feature_columns=cont_features,
-                                               hidden_units=[10, 20, 10],
-                                               n_classes=3)
-    classifier.fit(iris.data, iris.target)
-    classifier.save(path)
+    classifier = learn.DNNClassifier(feature_columns=cont_features,
+                                     hidden_units=[10, 20, 10],
+                                     n_classes=3)
+    classifier.fit(iris.data, iris.target, max_steps=100)
     # TODO(ipolosukhin): Remove or restore.
+    # path = tf.test.get_temp_dir() + '/tmp/tmp.saver4'
+    # classifier.save(path)
 #     os.remove(os.path.join(path, 'checkpoint'))
 #     with self.assertRaises(NotImplementedError):
 #       learn.TensorFlowEstimator.restore(path)
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 7f2218ada7a..5da8502845c 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -421,7 +421,8 @@ $(wildcard tensorflow/core/platform/*.cc) \
 $(wildcard tensorflow/core/platform/*/*.cc) \
 $(wildcard tensorflow/core/platform/*/*/*.cc) \
 $(wildcard tensorflow/core/util/*.cc) \
-$(wildcard tensorflow/core/util/*/*.cc)
+$(wildcard tensorflow/core/util/*/*.cc) \
+tensorflow/core/util/version_info.cc
 CORE_CC_EXCLUDE_SRCS := \
 $(wildcard tensorflow/core/*/*test.cc) \
 $(wildcard tensorflow/core/*/*testutil*) \
@@ -477,6 +478,11 @@ all: $(LIB_PATH) $(BENCHMARK_NAME)
 
 # Rules for target compilation.
 
+
+.phony_version_info:
+tensorflow/core/util/version_info.cc: .phony_version_info
+	python tensorflow/tools/git/gen_git_source.py --raw_generate $@
+
 # Gathers together all the objects we've compiled into a single '.a' archive.
 $(LIB_PATH): $(LIB_OBJS)
 	@mkdir -p $(dir $@)
@@ -553,6 +559,7 @@ $(HOST_GENDIR)%.pb.cc $(HOST_GENDIR)%.pb.h: %.proto
 # Gets rid of all generated files.
 clean:
 	rm -rf $(MAKEFILE_DIR)/gen
+	rm -rf tensorflow/core/util/version_info.cc
 
 # Gets rid of target files only, leaving the host alone. Also leaves the lib
 # directory untouched deliberately, so we can persist multiple architectures
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 5a8b8c86e52..b100e87d6c4 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -68,6 +68,7 @@ load(
 )
 load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu")
 load("//tensorflow:tensorflow.bzl", "tf_cc_tests_gpu")
+load("//tensorflow:tensorflow.bzl", "tf_version_info_genrule")
 
 # For platform specific build config
 load(
@@ -938,6 +939,13 @@ cc_library(
     ],
 )
 
+tf_version_info_genrule()
+
+cc_library(
+    name = "version_lib",
+    srcs = ["util/version_info.cc"],
+)
+
 tf_cuda_library(
     name = "framework_internal",
     srcs = glob(
@@ -980,6 +988,7 @@ tf_cuda_library(
         ":lib_internal",
         ":proto_text",
         ":protos_all_cc",
+        ":version_lib",
         "//tensorflow/core/kernels:bounds_check",
         "//third_party/eigen3",
     ],
diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc
index 9ef85d645d5..6751921030f 100644
--- a/tensorflow/core/common_runtime/rendezvous_mgr.cc
+++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc
@@ -121,14 +121,19 @@ void IntraProcessRendezvous::RecvAsync(const ParsedKey& parsed,
                                            const Rendezvous::Args& recv_args,
                                            const Tensor& in, bool is_dead) {
     Status s = status;
-    Tensor* out = new Tensor;
+
+    // If "in" is an uninitialized tensor, do copy-construction to preserve
+    // the uninitialized state, along with data type and shape info, which
+    // is useful for debugger purposes.
+    Tensor* out = in.IsInitialized() ? new Tensor : new Tensor(in);
+
     StatusCallback final_callback = [done, send_args, recv_args, out,
                                      is_dead](const Status& s) {
       done(s, send_args, recv_args, *out, is_dead);
       delete out;
     };
 
-    if (s.ok()) {
+    if (s.ok() && in.IsInitialized()) {
       SameWorkerRecvDone(parsed, send_args, recv_args, in, out, final_callback);
     } else {
       final_callback(s);
diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc
index 1d1f01a5b2b..f7897d77641 100644
--- a/tensorflow/core/debug/debug_gateway_test.cc
+++ b/tensorflow/core/debug/debug_gateway_test.cc
@@ -450,7 +450,7 @@ TEST_F(SessionDebugMinusAXTest,
   }
 }
 
-class SessionDebugGPUVariableTest : public ::testing::Test {
+class SessionDebugVariableTest : public ::testing::Test {
  public:
   void Initialize() {
     Graph graph(OpRegistry::Global());
@@ -509,7 +509,84 @@ class SessionDebugGPUVariableTest : public ::testing::Test {
   GraphDef def_;
 };
 
-TEST_F(SessionDebugGPUVariableTest, VariableAssignWithDebugOps) {
+TEST_F(SessionDebugVariableTest, WatchUninitializedVariableWithDebugOps) {
+  Initialize();
+  std::unique_ptr<DirectSession> session(CreateSession());
+  ASSERT_TRUE(session != nullptr);
+
+  DebugGateway debug_gateway(session.get());
+
+  TF_ASSERT_OK(session->Create(def_));
+
+  // Set up DebugTensorWatch for an uninitialized tensor (in node var).
+  RunOptions run_opts;
+  const string debug_identity = "DebugIdentity";
+  DebugTensorWatch* tensor_watch_opts = run_opts.add_debug_tensor_watch_opts();
+  tensor_watch_opts->set_node_name(var_node_name_);
+  tensor_watch_opts->set_output_slot(0);
+  tensor_watch_opts->add_debug_ops(debug_identity);
+
+  // Expected name of the inserted debug node
+  string debug_identity_node_name = DebugNodeInserter::GetDebugNodeName(
+      strings::StrCat(var_node_name_, ":", 0), 0, debug_identity);
+
+  // Supply completion and value callbacks
+  mutex mu;
+  // Completed nodes with and without outputs
+  std::vector<string> completed_debug_nodes;
+
+  Notification callbacks_done;
+  debug_gateway.SetNodeCompletionCallback(
+      [this, &mu, &debug_identity_node_name, &completed_debug_nodes,
+       &callbacks_done](const string& node_name, const bool any_output) {
+        mutex_lock l(mu);
+        if (any_output && (node_name == debug_identity_node_name)) {
+          completed_debug_nodes.push_back(node_name);
+        }
+      });
+
+  std::vector<Tensor> debug_identity_tensor_vals;
+
+  debug_gateway.SetNodeValueCallback(
+      [this, &mu, &debug_identity_node_name, &debug_identity_tensor_vals,
+       &callbacks_done](const string& node_name, const int output_slot,
+                        const Tensor& tensor_value, const bool is_ref) {
+        mutex_lock l(mu);
+        if (node_name == debug_identity_node_name && output_slot == 0) {
+          // output_slot == 0 carries the debug signal. Same below.
+          debug_identity_tensor_vals.push_back(tensor_value);
+        }
+
+        // Set the notification once we have the value from the target node.
+        if (node_name == init_node_name_ && !callbacks_done.HasBeenNotified()) {
+          callbacks_done.Notify();
+        }
+      });
+
+  // First run the initialization op
+  std::vector<std::pair<string, Tensor>> inputs_init;
+  std::vector<Tensor> outputs_init;
+
+  RunMetadata run_metadata;
+  Status s = session->Run(run_opts, inputs_init, {init_node_name_}, {},
+                          &outputs_init, &run_metadata);
+  TF_ASSERT_OK(s);
+
+  callbacks_done.WaitForNotification();
+
+  ASSERT_EQ(1, completed_debug_nodes.size());
+  ASSERT_EQ(
+      1, std::count(completed_debug_nodes.begin(), completed_debug_nodes.end(),
+                    debug_identity_node_name));
+
+  // Assert the output reflects the uninitialized nature of var's tensor.
+  ASSERT_EQ(1, debug_identity_tensor_vals.size());
+  ASSERT_FALSE(debug_identity_tensor_vals[0].IsInitialized());
+  ASSERT_EQ(DT_FLOAT, debug_identity_tensor_vals[0].dtype());
+  ASSERT_EQ(TensorShape({3}), debug_identity_tensor_vals[0].shape());
+}
+
+TEST_F(SessionDebugVariableTest, VariableAssignWithDebugOps) {
   // Tensor contains one count of NaN
   Initialize();
   std::unique_ptr<DirectSession> session(CreateSession());
diff --git a/tensorflow/core/debug/debug_io_utils.h b/tensorflow/core/debug/debug_io_utils.h
index 553ae9ab7d2..2366abeda26 100644
--- a/tensorflow/core/debug/debug_io_utils.h
+++ b/tensorflow/core/debug/debug_io_utils.h
@@ -30,6 +30,8 @@ class DebugIO {
   // Args:
   //   tensor_name: Name of the tensor being published: node_name followed by
   //     a colon, followed by the output slot index. E.g., "node_a:0".
+  //     N.B.: Use the original tensor name, i.e., name of the input tensor to
+  //     the debug op, even if the debug_op is not DebugIdentity.
   //   debug_op: Name of the debug op, e.g., "DebugIdentity".
   //   tensor: The Tensor object being published.
   //   wall_time_us: Time stamp for the Tensor. Unit: microseconds (us).
diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h
index 17c87ffab28..95c0e49b9c8 100644
--- a/tensorflow/core/kernels/debug_ops.h
+++ b/tensorflow/core/kernels/debug_ops.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace tensorflow {
 
@@ -44,24 +45,31 @@ class CopyOp : public OpKernel {
     bool off_host_input = device->device_type() == DEVICE_GPU &&
                           !context->input_alloc_attr(0).on_host();
 
-    Tensor* copied_tensor;
-    OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(),
-                                                     &copied_tensor));
+    if (src_tensor.IsInitialized()) {
+      // Source tensor is initialized. Make a copy.
+      Tensor* copied_tensor;
+      OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(),
+                                                       &copied_tensor));
+
 #if GOOGLE_CUDA
-    if (off_host_input) {
-      // Input is not on host: deep-copy it from GPU to the same GPU.
-      Notification done_copy;
-      GPUUtil::CopyGPUTensorToSameGPU(
-          device, device_ctxt, &src_tensor, copied_tensor,
-          [&done_copy](const Status& s) { done_copy.Notify(); });
-      done_copy.WaitForNotification();
-    } else {
-      // The input tensor is on the host (CPU): deep-copy from CPU to CPU.
-      *copied_tensor = tensor::DeepCopy(src_tensor);
-    }
+      if (off_host_input) {
+        // Input is not on host: deep-copy it from GPU to the same GPU.
+        Notification done_copy;
+        GPUUtil::CopyGPUTensorToSameGPU(
+            device, device_ctxt, &src_tensor, copied_tensor,
+            [&done_copy](const Status& s) { done_copy.Notify(); });
+        done_copy.WaitForNotification();
+      } else {
+        // The input tensor is on the host (CPU): deep-copy from CPU to CPU.
+        *copied_tensor = tensor::DeepCopy(src_tensor);
+      }
 #else
-    *copied_tensor = tensor::DeepCopy(src_tensor);
-#endif  // GOOGLE_CUDA
+      *copied_tensor = tensor::DeepCopy(src_tensor);
+#endif
+    } else {
+      // Source tensor is NOT initialized. Forward the Tensor object.
+      context->set_output(0, src_tensor);
+    }
   }
 
   bool IsExpensive() override { return false; }
@@ -110,15 +118,19 @@ class DebugNanCountOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
 
-    const TensorShape& input_shape = input.shape();
-    const T* input_flat = input.template flat<T>().data();
-
-    // Count NaNs.
     // Use DT_INT64/int64 to be consistent with TensorShape::num_elements().
     int64 nan_count = 0;
-    for (int64 i = 0; i < input_shape.num_elements(); ++i) {
-      if (Eigen::numext::isnan(input_flat[i])) {
-        nan_count++;
+
+    // If the input is an uninitialized tensor, let nan_count be 0.
+    if (input.IsInitialized()) {
+      // Count NaNs.
+      const TensorShape& input_shape = input.shape();
+      const T* input_flat = input.template flat<T>().data();
+
+      for (int64 i = 0; i < input_shape.num_elements(); ++i) {
+        if (Eigen::numext::isnan(input_flat[i])) {
+          nan_count++;
+        }
       }
     }
 
@@ -127,6 +139,11 @@ class DebugNanCountOp : public OpKernel {
     Tensor* output_tensor;
     OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor));
     output_tensor->vec<int64>()(0) = nan_count;
+
+    if (!debug_urls_.empty()) {
+      DebugIO::PublishDebugTensor(tensor_name_, "DebugNanCount", *output_tensor,
+                                  Env::Default()->NowMicros(), debug_urls_);
+    }
   }
 
   bool IsExpensive() override { return false; }
@@ -141,4 +158,4 @@ class DebugNanCountOp : public OpKernel {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_KERNELS_IDENTITY_OP_H_
+#endif  // TENSORFLOW_KERNELS_DEBUG_OP_H_
diff --git a/tensorflow/core/lib/io/record_reader_writer_test.cc b/tensorflow/core/lib/io/record_reader_writer_test.cc
index a44c35d7fd1..0a656473e4b 100644
--- a/tensorflow/core/lib/io/record_reader_writer_test.cc
+++ b/tensorflow/core/lib/io/record_reader_writer_test.cc
@@ -67,4 +67,42 @@ TEST(RecordReaderWriterTest, TestBasics) {
   }
 }
 
+TEST(RecordReaderWriterTest, TestZlib) {
+  Env* env = Env::Default();
+  string fname = testing::TmpDir() + "/record_reader_writer_zlib_test";
+
+  for (auto buf_size : BufferSizes()) {
+    // Zlib compression needs output buffer size > 1.
+    if (buf_size == 1) continue;
+    {
+      std::unique_ptr<WritableFile> file;
+      TF_CHECK_OK(env->NewWritableFile(fname, &file));
+      // Compress with zlib, using buf_size as the deflate output buffer.
+      io::RecordWriterOptions options;
+      options.compression_type = io::RecordWriterOptions::ZLIB_COMPRESSION;
+      options.zlib_options.output_buffer_size = buf_size;
+      io::RecordWriter writer(file.get(), options);
+      TF_CHECK_OK(writer.WriteRecord("abc"));
+      TF_CHECK_OK(writer.WriteRecord("defg"));
+      TF_CHECK_OK(writer.Flush());
+    }
+
+    {
+      std::unique_ptr<RandomAccessFile> read_file;
+      // Read it back with the RecordReader.
+      TF_CHECK_OK(env->NewRandomAccessFile(fname, &read_file));
+      io::RecordReaderOptions options;
+      options.compression_type = io::RecordReaderOptions::ZLIB_COMPRESSION;
+      options.zlib_options.input_buffer_size = buf_size;
+      io::RecordReader reader(read_file.get(), options);
+      uint64 offset = 0;
+      string record;
+      TF_CHECK_OK(reader.ReadRecord(&offset, &record));
+      EXPECT_EQ("abc", record);
+      TF_CHECK_OK(reader.ReadRecord(&offset, &record));
+      EXPECT_EQ("defg", record);
+    }
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/record_writer.cc b/tensorflow/core/lib/io/record_writer.cc
index 25873b83ba3..516332d2b73 100644
--- a/tensorflow/core/lib/io/record_writer.cc
+++ b/tensorflow/core/lib/io/record_writer.cc
@@ -33,6 +33,11 @@ RecordWriter::RecordWriter(WritableFile* dest,
     zlib_output_buffer_.reset(new ZlibOutputBuffer(
         dest_, options.zlib_options.input_buffer_size,
         options.zlib_options.output_buffer_size, options.zlib_options));
+    Status s = zlib_output_buffer_->Init();  // Required before any write.
+    if (!s.ok()) {
+      LOG(FATAL) << "Failed to initialize Zlib outputbuffer. Error: "
+                 << s.ToString();
+    }
 #endif  // IS_SLIM_BUILD
   } else if (options.compression_type == RecordWriterOptions::NONE) {
     // Nothing to do
diff --git a/tensorflow/core/lib/io/zlib_buffers_test.cc b/tensorflow/core/lib/io/zlib_buffers_test.cc
index eaaf1497594..1290e98ce2c 100644
--- a/tensorflow/core/lib/io/zlib_buffers_test.cc
+++ b/tensorflow/core/lib/io/zlib_buffers_test.cc
@@ -73,6 +73,7 @@ void TestAllCombinations(CompressionOptions input_options,
 
         ZlibOutputBuffer out(file_writer.get(), input_buf_size, output_buf_size,
                              output_options);
+        TF_CHECK_OK(out.Init());
 
         TF_CHECK_OK(out.Write(StringPiece(data)));
         TF_CHECK_OK(out.Close());
@@ -120,6 +121,7 @@ void TestMultipleWrites(uint8 input_buf_size, uint8 output_buf_size,
   TF_CHECK_OK(env->NewWritableFile(fname, &file_writer));
   ZlibOutputBuffer out(file_writer.get(), input_buf_size, output_buf_size,
                        output_options);
+  TF_CHECK_OK(out.Init());
 
   for (int i = 0; i < num_writes; i++) {
     TF_CHECK_OK(out.Write(StringPiece(data)));
@@ -172,6 +174,7 @@ TEST(ZlibInputStream, FailsToReadIfWindowBitsAreIncompatible) {
   string result;
   ZlibOutputBuffer out(file_writer.get(), input_buf_size, output_buf_size,
                        output_options);
+  TF_CHECK_OK(out.Init());
 
   TF_CHECK_OK(out.Write(StringPiece(data)));
   TF_CHECK_OK(out.Close());
diff --git a/tensorflow/core/lib/io/zlib_outputbuffer.cc b/tensorflow/core/lib/io/zlib_outputbuffer.cc
index 9493804bcb8..bdedfd00e86 100644
--- a/tensorflow/core/lib/io/zlib_outputbuffer.cc
+++ b/tensorflow/core/lib/io/zlib_outputbuffer.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/lib/io/zlib_outputbuffer.h"
 
+#include "tensorflow/core/lib/core/errors.h"
+
 namespace tensorflow {
 namespace io {
 
@@ -25,30 +27,13 @@ ZlibOutputBuffer::ZlibOutputBuffer(
     const ZlibCompressionOptions&
         zlib_options)  // size of z_stream.next_out buffer
     : file_(file),
+      init_status_(),
       input_buffer_capacity_(input_buffer_bytes),
       output_buffer_capacity_(output_buffer_bytes),
       z_stream_input_(new Bytef[input_buffer_bytes]),
       z_stream_output_(new Bytef[output_buffer_bytes]),
       zlib_options_(zlib_options),
-      z_stream_(new z_stream) {
-  memset(z_stream_.get(), 0, sizeof(z_stream));
-  z_stream_->zalloc = Z_NULL;
-  z_stream_->zfree = Z_NULL;
-  z_stream_->opaque = Z_NULL;
-  int status =
-      deflateInit2(z_stream_.get(), zlib_options.compression_level,
-                   zlib_options.compression_method, zlib_options.window_bits,
-                   zlib_options.mem_level, zlib_options.compression_strategy);
-  if (status != Z_OK) {
-    LOG(FATAL) << "deflateInit failed with status " << status;
-    z_stream_.reset(NULL);
-  } else {
-    z_stream_->next_in = z_stream_input_.get();
-    z_stream_->next_out = z_stream_output_.get();
-    z_stream_->avail_in = 0;
-    z_stream_->avail_out = output_buffer_capacity_;
-  }
-}
+      z_stream_(new z_stream) {}
 
 ZlibOutputBuffer::~ZlibOutputBuffer() {
   if (z_stream_.get()) {
@@ -56,6 +41,33 @@ ZlibOutputBuffer::~ZlibOutputBuffer() {
   }
 }
 
+// Validates the output buffer size and sets up the zlib deflate stream.
+Status ZlibOutputBuffer::Init() {
+  // Output buffer size should be greater than 1 because deflation needs at
+  // least one byte for bookkeeping etc.
+  if (output_buffer_capacity_ <= 1) {
+    return errors::InvalidArgument(
+        "output_buffer_bytes should be greater than 1");
+  }
+  memset(z_stream_.get(), 0, sizeof(z_stream));
+  z_stream_->zalloc = Z_NULL;
+  z_stream_->zfree = Z_NULL;
+  z_stream_->opaque = Z_NULL;
+  int status =
+      deflateInit2(z_stream_.get(), zlib_options_.compression_level,
+                   zlib_options_.compression_method, zlib_options_.window_bits,
+                   zlib_options_.mem_level, zlib_options_.compression_strategy);
+  if (status != Z_OK) {
+    z_stream_.reset(NULL);  // Drop the stream that failed to initialize.
+    return errors::InvalidArgument("deflateInit failed with status: ", status);
+  }
+  z_stream_->next_in = z_stream_input_.get();
+  z_stream_->next_out = z_stream_output_.get();
+  z_stream_->avail_in = 0;  // No input has been buffered yet.
+  z_stream_->avail_out = output_buffer_capacity_;
+  return Status::OK();
+}
+
 int32 ZlibOutputBuffer::AvailableInputSpace() const {
   return input_buffer_capacity_ - z_stream_->avail_in;
 }
diff --git a/tensorflow/core/lib/io/zlib_outputbuffer.h b/tensorflow/core/lib/io/zlib_outputbuffer.h
index 08455b63b50..a53c40b8fbc 100644
--- a/tensorflow/core/lib/io/zlib_outputbuffer.h
+++ b/tensorflow/core/lib/io/zlib_outputbuffer.h
@@ -45,6 +45,7 @@ class ZlibOutputBuffer {
   // 2. the deflated output
   // with sizes `input_buffer_bytes` and `output_buffer_bytes` respectively.
   // Does not take ownership of `file`.
+  // output_buffer_bytes should be greater than 1.
   ZlibOutputBuffer(
       WritableFile* file,
       int32 input_buffer_bytes,   // size of z_stream.next_in buffer
@@ -53,6 +54,10 @@ class ZlibOutputBuffer {
 
   ~ZlibOutputBuffer();
 
+  // Initializes some state necessary for the output buffer. This call is
+  // required before any other operation on the buffer.
+  Status Init();
+
   // Adds `data` to the compression pipeline.
   //
   // The input data is buffered in `z_stream_input_` and is compressed in bulk
@@ -78,6 +83,7 @@ class ZlibOutputBuffer {
 
  private:
   WritableFile* file_;  // Not owned
+  Status init_status_;
   size_t input_buffer_capacity_;
   size_t output_buffer_capacity_;
 
diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc
index 1c315655c1e..8ae56b4906a 100644
--- a/tensorflow/core/lib/strings/str_util.cc
+++ b/tensorflow/core/lib/strings/str_util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <ctype.h>
 #include <vector>
 #include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace tensorflow {
 namespace str_util {
@@ -334,5 +335,58 @@ bool SplitAndParseAsInts(StringPiece text, char delim,
   return true;
 }
 
+string HumanReadableElapsedTime(double seconds) {
+  string human_readable;
+  // A negative interval is rendered as "-" followed by its magnitude.
+  if (seconds < 0) {
+    human_readable = "-";
+    seconds = -seconds;
+  }
+
+  // Start with us and keep going up to years.
+  // The comparisons must account for rounding to prevent the format breaking
+  // the tested condition and returning, e.g., "1e+03 us" instead of "1 ms".
+  const double microseconds = seconds * 1.0e6;
+  if (microseconds < 999.5) {
+    strings::Appendf(&human_readable, "%0.3g us", microseconds);
+    return human_readable;
+  }
+  double milliseconds = seconds * 1e3;
+  if (milliseconds >= .995 && milliseconds < 1) {
+    // Round half to even in Appendf would convert this to 0.999 ms.
+    milliseconds = 1.0;
+  }
+  if (milliseconds < 999.5) {
+    strings::Appendf(&human_readable, "%0.3g ms", milliseconds);
+    return human_readable;
+  }
+  if (seconds < 60.0) {
+    strings::Appendf(&human_readable, "%0.3g s", seconds);
+    return human_readable;
+  }
+  seconds /= 60.0;  // From here on `seconds` holds minutes.
+  if (seconds < 60.0) {
+    strings::Appendf(&human_readable, "%0.3g min", seconds);
+    return human_readable;
+  }
+  seconds /= 60.0;  // Now hours.
+  if (seconds < 24.0) {
+    strings::Appendf(&human_readable, "%0.3g h", seconds);
+    return human_readable;
+  }
+  seconds /= 24.0;  // Now days.
+  if (seconds < 30.0) {
+    strings::Appendf(&human_readable, "%0.3g days", seconds);
+    return human_readable;
+  }
+  if (seconds < 365.2425) {  // Months use 365.2425 / 12 = 30.436875 days.
+    strings::Appendf(&human_readable, "%0.3g months", seconds / 30.436875);
+    return human_readable;
+  }
+  seconds /= 365.2425;  // Now years of 365.2425 days.
+  strings::Appendf(&human_readable, "%0.3g years", seconds);
+  return human_readable;
+}
+
 }  // namespace str_util
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/str_util.h b/tensorflow/core/lib/strings/str_util.h
index 4a519425d47..8198b7c5418 100644
--- a/tensorflow/core/lib/strings/str_util.h
+++ b/tensorflow/core/lib/strings/str_util.h
@@ -80,6 +80,15 @@ string Uppercase(StringPiece s);
 // set of characters that can be used as word boundaries.
 void TitlecaseString(string* s, StringPiece delimiters);
 
+// Converts a time interval, given in seconds as a double, to a
+// human-readable string. For example:
+//   0.001       -> "1 ms"
+//   10.0        -> "10 s"
+//   933120.0    -> "10.8 days"
+//   39420000.0  -> "1.25 years"
+//   -10         -> "-10 s"
+string HumanReadableElapsedTime(double seconds);
+
 // Join functionality
 template <typename T>
 string Join(const T& s, const char* sep);
diff --git a/tensorflow/core/lib/strings/str_util_test.cc b/tensorflow/core/lib/strings/str_util_test.cc
index a0494190fda..6481e5cfd75 100644
--- a/tensorflow/core/lib/strings/str_util_test.cc
+++ b/tensorflow/core/lib/strings/str_util_test.cc
@@ -287,4 +287,23 @@ TEST(TitlecaseString, Basic) {
   ASSERT_EQ(s, "Dense");
 }
 
+TEST(HumanReadableElapsedTime, Basic) {
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(-10), "-10 s");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(-0.001), "-1 ms");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(-60.0), "-1 min");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(0.00000001), "0.01 us");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(0.0000012), "1.2 us");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(0.0012), "1.2 ms");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(0.12), "120 ms");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(1.12), "1.12 s");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(90.0), "1.5 min");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(600.0), "10 min");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(9000.0), "2.5 h");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(87480.0), "1.01 days");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(7776000.0), "2.96 months");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(78840000.0), "2.5 years");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(382386614.40), "12.1 years");
+  EXPECT_EQ(str_util::HumanReadableElapsedTime(DBL_MAX), "5.7e+300 years");
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 45723e48cfb..fc1d031d4b7 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -3364,6 +3364,7 @@ REGISTER_OP("Copy")
     .Output("output: T")
     .Attr("T: type")
     .Attr("tensor_name: string = ''")
+    .SetAllowsUninitializedInput()
     .Doc(R"doc(
 Copy Op.
 
@@ -3383,6 +3384,7 @@ REGISTER_OP("CopyHost")
     .Output("output: T")
     .Attr("T: type")
     .Attr("tensor_name: string = ''")
+    .SetAllowsUninitializedInput()
     .Doc(R"doc(
 Copy Host Op.
 
@@ -3401,6 +3403,7 @@ REGISTER_OP("DebugIdentity")
     .Attr("T: type")
     .Attr("tensor_name: string = ''")
     .Attr("debug_urls: list(string) = []")
+    .SetAllowsUninitializedInput()
     .Doc(R"doc(
 Debug Identity Op.
 
@@ -3419,6 +3422,7 @@ REGISTER_OP("DebugNanCount")
     .Attr("T: type")
     .Attr("tensor_name: string = ''")
     .Attr("debug_urls: list(string) = []")
+    .SetAllowsUninitializedInput()
     .Doc(R"doc(
 Debug NaN Value Counter Op
 
diff --git a/tensorflow/core/ops/compat/ops_history.v0.pbtxt b/tensorflow/core/ops/compat/ops_history.v0.pbtxt
index 8543aa0cb0d..7fb470a2fdd 100644
--- a/tensorflow/core/ops/compat/ops_history.v0.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v0.pbtxt
@@ -7705,6 +7705,29 @@ op {
     }
   }
 }
+op {
+  name: "Copy"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  allows_uninitialized_input: true
+}
 op {
   name: "CopyHost"
   input_arg {
@@ -7727,6 +7750,29 @@ op {
     }
   }
 }
+op {
+  name: "CopyHost"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  allows_uninitialized_input: true
+}
 op {
   name: "Cos"
   input_arg {
@@ -8194,6 +8240,37 @@ op {
     }
   }
 }
+op {
+  name: "DebugIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  allows_uninitialized_input: true
+}
 op {
   name: "DebugNanCount"
   input_arg {
@@ -8246,6 +8323,37 @@ op {
     }
   }
 }
+op {
+  name: "DebugNanCount"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  allows_uninitialized_input: true
+}
 op {
   name: "DecodeBase64"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 946e098eba2..4325da85585 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -4058,6 +4058,7 @@ op {
   }
   summary: "Copy Op."
   description: "Performs CPU-to-CPU or GPU-to-GPU deep-copying of tensor, depending on the\ndevice on which the tensor is allocated.\n\nUnlike the CopyHost Op, this op does not have HostMemory constraint on its\ninput or output."
+  allows_uninitialized_input: true
 }
 op {
   name: "CopyHost"
@@ -4085,6 +4086,7 @@ op {
   }
   summary: "Copy Host Op."
   description: "Performs CPU-to-CPU deep-copying of tensor.\n\nUnlike the Copy Op, this op has HostMemory constraint on its input or output."
+  allows_uninitialized_input: true
 }
 op {
   name: "Cos"
@@ -4498,6 +4500,7 @@ op {
   }
   summary: "Debug Identity Op."
   description: "Provides an identity mapping of the non-Ref type input tensor for debugging."
+  allows_uninitialized_input: true
 }
 op {
   name: "DebugNanCount"
@@ -4534,6 +4537,7 @@ op {
   }
   summary: "Debug NaN Value Counter Op"
   description: "Counts number of NaNs in the input tensor, for debugging."
+  allows_uninitialized_input: true
 }
 op {
   name: "DecodeBase64"
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index ab82617a136..2c6a9cde91b 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -38,10 +38,11 @@ static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse,
   DimensionHandle unused;
   TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused));
 
-  // Trailing part of grad matches *s.
-  ShapeHandle grad_subshape;
-  TF_RETURN_IF_ERROR(c->Subshape(grad, 1, &grad_subshape));
-  TF_RETURN_IF_ERROR(c->Merge(*s, grad_subshape, s));
+  // Trailing part of grad matches trailing part of *s.
+  ShapeHandle grad_unknown_first;
+  TF_RETURN_IF_ERROR(
+      c->ReplaceDim(grad, 0, c->UnknownDim(), &grad_unknown_first));
+  TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s));
 
   return Status::OK();
 }
diff --git a/tensorflow/core/ops/training_ops_test.cc b/tensorflow/core/ops/training_ops_test.cc
index d1023a1e73d..9c3489211c8 100644
--- a/tensorflow/core/ops/training_ops_test.cc
+++ b/tensorflow/core/ops/training_ops_test.cc
@@ -30,15 +30,14 @@ static void TestGradAndIndicesErrorHandling(ShapeInferenceTestOp op,
                            grad_indices_spec, shape_spec_end);
   };
 
-  // mismatch between grad[1] and var[0].
-  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
-              shape_spec("[1]", "[?,2];[?]").c_str());
+  // mismatch between grad[1] and var[1].
+  INFER_ERROR("Dimension 1 in both shapes must be equal", op,
+              shape_spec("[?,1]", "[?,2];[?]").c_str());
   // grad[0] and indices[0] must match.
   INFER_ERROR("Dimensions must be equal, but are 1 and 2", op,
               shape_spec("?", "[2,?];[1]").c_str());
   // grad is wrong rank.
-  INFER_ERROR("Shapes must be equal rank, but are 1 and 0", op,
-              shape_spec("[1]", "[2];[?]").c_str());
+  INFER_ERROR("must be equal rank", op, shape_spec("[1]", "[?,2];[?]").c_str());
   // indices is wrong rank.
   INFER_ERROR("Shape must be rank 1 but is rank 2", op,
               shape_spec("[?]", "[?];[1,2]").c_str());
@@ -74,7 +73,7 @@ TEST(TrainingOpsTest, SparseApplyProximalGradientDescent_ShapeFn) {
   ShapeInferenceTestOp op("SparseApplyProximalGradientDescent");
 
   // Output is a merge of inputs 0 (var) and the non-indices part of 4 (delta).
-  INFER_OK(op, "[1,?];[];[];[];[?,?,2];[3]", "[d0_0,d4_2]");
+  INFER_OK(op, "[1,?];[];[];[];[?,2];[3]", "[d0_0,d4_1]");
 
   TestGradAndIndicesErrorHandling(op, "[];[];[]");
 
@@ -109,14 +108,14 @@ TEST(TrainingOpsTest, SparseApplyAdadelta_ShapeFn) {
 
   // Output is a merge of inputs 0, 1, 2, and non-indices part of 6 (var, accum,
   // accum_update, grad).
-  INFER_OK(op, "[1,?,?,?];[?,2,?,?];[?,?,3,?];[];[];[];[?,?,?,?,4];?",
-           "[d0_0,d1_1,d2_2,d6_4]");
+  INFER_OK(op, "[1,?,?,?];[?,2,?,?];[?,?,3,?];[];[];[];[?,?,?,4];?",
+           "[d0_0,d1_1,d2_2,d6_3]");
   INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
               "[1];[2];[1];[];[];[];[1];?");
   INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
               "[1];[1];[2];[];[];[];[1];?");
-  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
-              "[1];[1];[1];[];[];[];[?,2];?");
+  INFER_ERROR("Dimension 1 in both shapes must be equal, but are 1 and 2", op,
+              "[?,1];[?,1];[?,1];[];[];[];[?,2];?");
 
   TestGradAndIndicesErrorHandling(op, "?;?;?;?;?");
 
@@ -145,11 +144,11 @@ TEST(TrainingOpsTest, SparseApplyAdagrad_ShapeFn) {
 
   // Output is a merge of inputs 0, 1, and non-indices part of 3 (var, accum,
   // grad).
-  INFER_OK(op, "[1,?,?];[?,2,?];[];[?,?,?,3];?", "[d0_0,d1_1,d3_3]");
-  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
-              "[1];[2];[];[1];?");
-  INFER_ERROR("Shapes must be equal rank, but are 1 and 0", op,
-              "[1];[1];[];[2];?");
+  INFER_OK(op, "[1,?,?];[?,2,?];[];[?,?,3];?", "[d0_0,d1_1,d3_2]");
+  INFER_ERROR("Dimension 1 in both shapes must be equal, but are 1 and 2", op,
+              "[?,1];[?,2];[];[?,1];?");
+  INFER_ERROR("Shapes must be equal rank, but are 2 and 3", op,
+              "[?,1];[?,1];[];[?,?,2];?");
 
   TestGradAndIndicesErrorHandling(op, "?;?");
 
@@ -178,11 +177,11 @@ TEST(TrainingOpsTest, SparseApplyProximalAdagrad_ShapeFn) {
 
   // Output is a merge of inputs 0, 1, and the non-indices part of 5 (var,
   // accum, grad).
-  INFER_OK(op, "[1,?,?];[?,2,?];[];[];[];[?,?,?,3];?", "[d0_0,d1_1,d5_3]");
+  INFER_OK(op, "[1,?,?];[?,2,?];[];[];[];[?,?,3];?", "[d0_0,d1_1,d5_2]");
   INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
               "[1];[2];[];[];[];[?,1];?");
-  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
-              "[1];[1];[];[];[];[?,2];?");
+  INFER_ERROR("Dimension 1 in both shapes must be equal, but are 1 and 2", op,
+              "[?,1];[?,1];[];[];[];[?,2];?");
 
   TestGradAndIndicesErrorHandling(op, "?;?;?;?");
 
@@ -217,14 +216,14 @@ TEST(TrainingOpsTest, SparseApplyFtrl_ShapeFn) {
 
   // Output is a merge of inputs 0, 1, 2, and non-indices part of 3 (var, accum,
   // linear, grad).
-  INFER_OK(op, "[1,?,?,?];[?,2,?,?];[?,?,3,?];[?,?,?,?,4];?;[];[];[];[]",
-           "[d0_0,d1_1,d2_2,d3_4]");
+  INFER_OK(op, "[1,?,?,?];[?,2,?,?];[?,?,3,?];[?,?,?,4];?;[];[];[];[]",
+           "[d0_0,d1_1,d2_2,d3_3]");
   INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
               "[1];[2];[1];[?,1];?;[];[];[];[]");
   INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
               "[1];[1];[2];[?,1];?;[];[];[];[]");
-  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
-              "[1];[1];[1];[?,2];?;[];[];[];[]");
+  INFER_ERROR("Dimension 1 in both shapes must be equal, but are 1 and 2", op,
+              "[?,1];[?,1];[?,1];[?,2];?;[];[];[];[]");
 
   TestGradAndIndicesErrorHandling(op, "?;?", ";?;?;?;?");
 
@@ -255,11 +254,11 @@ TEST(TrainingOpsTest, SparseApplyMomentum_ShapeFn) {
 
   // Output is a merge of inputs 0, 1, and non-indices part of 3 (var, accum,
   // grad).
-  INFER_OK(op, "[1,?,?];[?,2,?];[];[?,?,?,3];?;[]", "[d0_0,d1_1,d3_3]");
-  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
-              "[1];[2];[];[?,1];?;[]");
-  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
-              "[1];[1];[];[?,2];?;[]");
+  INFER_OK(op, "[1,?,?];[?,2,?];[];[?,?,3];?;[]", "[d0_0,d1_1,d3_2]");
+  INFER_ERROR("Dimension 1 in both shapes must be equal, but are 1 and 2", op,
+              "[?,1];[?,2];[];[?,1];?;[]");
+  INFER_ERROR("Dimension 1 in both shapes must be equal, but are 1 and 2", op,
+              "[?,1];[?,1];[];[?,2];?;[]");
 
   TestGradAndIndicesErrorHandling(op, "?;?", ";?");
 
@@ -316,14 +315,14 @@ TEST(TrainingOpsTest, SparseApplyRMSProp_ShapeFn) {
 
   // Output is a merge of inputs 0, 1, 2, and the non-indices part of 7 (var,
   // ms, mom, and grad).
-  INFER_OK(op, "[1,?,?,?];[?,2,?,?];[?,?,3,?];[];[];[];[];[?,?,?,?,4];?",
-           "[d0_0,d1_1,d2_2,d7_4]");
+  INFER_OK(op, "[1,?,?,?];[?,2,?,?];[?,?,3,?];[];[];[];[];[?,?,?,4];?",
+           "[d0_0,d1_1,d2_2,d7_3]");
   INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
-              "[1];[2];[1];[];[];[];[];[?,1];?");
+              "[1];[2];[1];[];[];[];[];[1];?");
   INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
-              "[1];[1];[2];[];[];[];[];[?,1];?");
-  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
-              "[1];[1];[1];[];[];[];[];[?,2];?");
+              "[1];[1];[2];[];[];[];[];[1];?");
+  INFER_ERROR("Dimension 1 in both shapes must be equal, but are 1 and 2", op,
+              "[?,1];[?,1];[?,1];[];[];[];[];[?,2];?");
 
   TestGradAndIndicesErrorHandling(op, "?;?;?;?;?;?");
 
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index a9b63ea1f4c..037e5795f78 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -84,4 +84,12 @@ limitations under the License.
 #define TF_CHECKPOINT_VERSION_MIN_CONSUMER 0
 #define TF_CHECKPOINT_VERSION 1
 
+/// Version query functions (defined in generated version_info.cc)
+
+// Host compiler version (declared elsewhere to be __VERSION__)
+extern const char* tf_compiler_version();
+// The git commit designator when tensorflow was built
+// If no git repository, this will be "internal".
+extern const char* tf_git_version();
+
 #endif  // TENSORFLOW_CORE_PUBLIC_VERSION_H_
diff --git a/tensorflow/core/util/stat_summarizer.h b/tensorflow/core/util/stat_summarizer.h
index 775c8e54e9b..2dd3dbd5cd4 100644
--- a/tensorflow/core/util/stat_summarizer.h
+++ b/tensorflow/core/util/stat_summarizer.h
@@ -176,7 +176,7 @@ class StatSummarizer {
     std::vector<TensorDescription> outputs;
   };
 
-  enum struct SortingMetric {
+  enum SortingMetric {
     BY_TOTAL,
     BY_RUN_ORDER,
   };
diff --git a/tensorflow/examples/skflow/hdf5_classification.py b/tensorflow/examples/skflow/hdf5_classification.py
index 50e7d73b954..7c630836893 100644
--- a/tensorflow/examples/skflow/hdf5_classification.py
+++ b/tensorflow/examples/skflow/hdf5_classification.py
@@ -45,12 +45,11 @@ y_test = h5f['y_test']
 
 # Build 3 layer DNN with 10, 20, 10 units respectively.
 feature_columns = learn.infer_real_valued_columns_from_input(x_train)
-classifier = learn.TensorFlowDNNClassifier(
-    feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3,
-    steps=200)
+classifier = learn.DNNClassifier(
+    feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)
 
 # Fit and predict.
-classifier.fit(x_train, y_train)
+classifier.fit(x_train, y_train, steps=200)
 score = metrics.accuracy_score(y_test, classifier.predict(x_test))
 print('Accuracy: {0:f}'.format(score))
 
diff --git a/tensorflow/g3doc/api_docs/python/contrib.learn.md b/tensorflow/g3doc/api_docs/python/contrib.learn.md
index f5de19ae3e1..c9f31842d5c 100644
--- a/tensorflow/g3doc/api_docs/python/contrib.learn.md
+++ b/tensorflow/g3doc/api_docs/python/contrib.learn.md
@@ -1296,532 +1296,6 @@ component of a nested object.
 
 
 
-- - -
-
-### `class tf.contrib.learn.TensorFlowDNNClassifier` {#TensorFlowDNNClassifier}
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.__init__(*args, **kwargs)` {#TensorFlowDNNClassifier.__init__}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.bias_` {#TensorFlowDNNClassifier.bias_}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.config` {#TensorFlowDNNClassifier.config}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.dnn_bias_` {#TensorFlowDNNClassifier.dnn_bias_}
-
-Returns bias of deep neural network part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.dnn_weights_` {#TensorFlowDNNClassifier.dnn_weights_}
-
-Returns weights of deep neural network part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowDNNClassifier.evaluate}
-
-See `Evaluable`.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If at least one of `x` or `y` is provided, and at least one of
-      `input_fn` or `feed_fn` is provided.
-      Or if `metrics` is not `None` or `dict`.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.export(*args, **kwargs)` {#TensorFlowDNNClassifier.export}
-
-Exports inference graph into given dir. (deprecated arguments)
-
-SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23.
-Instructions for updating:
-The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. input_fn and input_feature_key will become required args, and use_deprecated_input_fn will default to False and be removed altogether.
-
-    Args:
-      export_dir: A string containing a directory to write the exported graph
-        and checkpoints.
-      input_fn: If `use_deprecated_input_fn` is true, then a function that given
-        `Tensor` of `Example` strings, parses it into features that are then
-        passed to the model. Otherwise, a function that takes no argument and
-        returns a tuple of (features, targets), where features is a dict of
-        string key to `Tensor` and targets is a `Tensor` that's currently not
-        used (and so can be `None`).
-      input_feature_key: Only used if `use_deprecated_input_fn` is false. String
-        key into the features dict returned by `input_fn` that corresponds to
-        the raw `Example` strings `Tensor` that the exported model will take as
-        input.
-      use_deprecated_input_fn: Determines the signature format of `input_fn`.
-      signature_fn: Function that returns a default signature and a named
-        signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s
-        for features and `Tensor` or `dict` of `Tensor`s for predictions.
-      default_batch_size: Default batch size of the `Example` placeholder.
-      exports_to_keep: Number of exports to keep.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.fit(x, y, steps=None, batch_size=None, monitors=None, logdir=None)` {#TensorFlowDNNClassifier.fit}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.get_params(deep=True)` {#TensorFlowDNNClassifier.get_params}
-
-Get parameters for this estimator.
-
-##### Args:
-
-
-*  <b>`deep`</b>: boolean, optional
-
-    If `True`, will return the parameters for this estimator and
-    contained subobjects that are estimators.
-
-##### Returns:
-
-  params : mapping of string to any
-  Parameter names mapped to their values.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.get_variable_names()` {#TensorFlowDNNClassifier.get_variable_names}
-
-Returns list of all variable names in this model.
-
-##### Returns:
-
-  List of names.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.get_variable_value(name)` {#TensorFlowDNNClassifier.get_variable_value}
-
-Returns value of the variable given by name.
-
-##### Args:
-
-
-*  <b>`name`</b>: string, name of the tensor.
-
-##### Returns:
-
-  Numpy array - value of the tensor.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.linear_bias_` {#TensorFlowDNNClassifier.linear_bias_}
-
-Returns bias of the linear part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.linear_weights_` {#TensorFlowDNNClassifier.linear_weights_}
-
-Returns weights per feature of the linear part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.model_dir` {#TensorFlowDNNClassifier.model_dir}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.partial_fit(x=None, y=None, input_fn=None, steps=1, batch_size=None, monitors=None)` {#TensorFlowDNNClassifier.partial_fit}
-
-Incremental fit on a batch of samples.
-
-This method is expected to be called several times consecutively
-on different or the same chunks of the dataset. This either can
-implement iterative training or out-of-core/online training.
-
-This is especially useful when the whole dataset is too big to
-fit in memory at the same time. Or when model is taking long time
-to converge, and you want to split up training into subparts.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-
-##### Returns:
-
-  `self`, for chaining.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If at least one of `x` and `y` is provided, and `input_fn` is
-      provided.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.predict(x=None, input_fn=None, batch_size=None, outputs=None, axis=1)` {#TensorFlowDNNClassifier.predict}
-
-Predict class or regression for `x`.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.predict_proba(x=None, input_fn=None, batch_size=None, outputs=None)` {#TensorFlowDNNClassifier.predict_proba}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.save(path)` {#TensorFlowDNNClassifier.save}
-
-Saves checkpoints and graph to given path.
-
-##### Args:
-
-
-*  <b>`path`</b>: Folder to save model to.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.set_params(**params)` {#TensorFlowDNNClassifier.set_params}
-
-Set the parameters of this estimator.
-
-The method works on simple estimators as well as on nested objects
-(such as pipelines). The former have parameters of the form
-``<component>__<parameter>`` so that it's possible to update each
-component of a nested object.
-
-##### Args:
-
-
-*  <b>`**params`</b>: Parameters.
-
-##### Returns:
-
-  self
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If params contain invalid names.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.weights_` {#TensorFlowDNNClassifier.weights_}
-
-
-
-
-
-- - -
-
-### `class tf.contrib.learn.TensorFlowDNNRegressor` {#TensorFlowDNNRegressor}
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.__init__(*args, **kwargs)` {#TensorFlowDNNRegressor.__init__}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.bias_` {#TensorFlowDNNRegressor.bias_}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.config` {#TensorFlowDNNRegressor.config}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.dnn_bias_` {#TensorFlowDNNRegressor.dnn_bias_}
-
-Returns bias of deep neural network part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.dnn_weights_` {#TensorFlowDNNRegressor.dnn_weights_}
-
-Returns weights of deep neural network part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowDNNRegressor.evaluate}
-
-See `Evaluable`.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If at least one of `x` or `y` is provided, and at least one of
-      `input_fn` or `feed_fn` is provided.
-      Or if `metrics` is not `None` or `dict`.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.export(*args, **kwargs)` {#TensorFlowDNNRegressor.export}
-
-Exports inference graph into given dir. (deprecated arguments)
-
-SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23.
-Instructions for updating:
-The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. input_fn and input_feature_key will become required args, and use_deprecated_input_fn will default to False and be removed altogether.
-
-    Args:
-      export_dir: A string containing a directory to write the exported graph
-        and checkpoints.
-      input_fn: If `use_deprecated_input_fn` is true, then a function that given
-        `Tensor` of `Example` strings, parses it into features that are then
-        passed to the model. Otherwise, a function that takes no argument and
-        returns a tuple of (features, targets), where features is a dict of
-        string key to `Tensor` and targets is a `Tensor` that's currently not
-        used (and so can be `None`).
-      input_feature_key: Only used if `use_deprecated_input_fn` is false. String
-        key into the features dict returned by `input_fn` that corresponds to
-        the raw `Example` strings `Tensor` that the exported model will take as
-        input.
-      use_deprecated_input_fn: Determines the signature format of `input_fn`.
-      signature_fn: Function that returns a default signature and a named
-        signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s
-        for features and `Tensor` or `dict` of `Tensor`s for predictions.
-      default_batch_size: Default batch size of the `Example` placeholder.
-      exports_to_keep: Number of exports to keep.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.fit(x, y, steps=None, batch_size=None, monitors=None, logdir=None)` {#TensorFlowDNNRegressor.fit}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.get_params(deep=True)` {#TensorFlowDNNRegressor.get_params}
-
-Get parameters for this estimator.
-
-##### Args:
-
-
-*  <b>`deep`</b>: boolean, optional
-
-    If `True`, will return the parameters for this estimator and
-    contained subobjects that are estimators.
-
-##### Returns:
-
-  params : mapping of string to any
-  Parameter names mapped to their values.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.get_variable_names()` {#TensorFlowDNNRegressor.get_variable_names}
-
-Returns list of all variable names in this model.
-
-##### Returns:
-
-  List of names.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.get_variable_value(name)` {#TensorFlowDNNRegressor.get_variable_value}
-
-Returns value of the variable given by name.
-
-##### Args:
-
-
-*  <b>`name`</b>: string, name of the tensor.
-
-##### Returns:
-
-  Numpy array - value of the tensor.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.linear_bias_` {#TensorFlowDNNRegressor.linear_bias_}
-
-Returns bias of the linear part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.linear_weights_` {#TensorFlowDNNRegressor.linear_weights_}
-
-Returns weights per feature of the linear part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.model_dir` {#TensorFlowDNNRegressor.model_dir}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.partial_fit(x=None, y=None, input_fn=None, steps=1, batch_size=None, monitors=None)` {#TensorFlowDNNRegressor.partial_fit}
-
-Incremental fit on a batch of samples.
-
-This method is expected to be called several times consecutively
-on different or the same chunks of the dataset. This either can
-implement iterative training or out-of-core/online training.
-
-This is especially useful when the whole dataset is too big to
-fit in memory at the same time. Or when model is taking long time
-to converge, and you want to split up training into subparts.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-
-##### Returns:
-
-  `self`, for chaining.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If at least one of `x` and `y` is provided, and `input_fn` is
-      provided.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.predict(x=None, input_fn=None, batch_size=None, outputs=None, axis=1)` {#TensorFlowDNNRegressor.predict}
-
-Predict class or regression for `x`.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.predict_proba(x=None, input_fn=None, batch_size=None, outputs=None)` {#TensorFlowDNNRegressor.predict_proba}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.save(path)` {#TensorFlowDNNRegressor.save}
-
-Saves checkpoints and graph to given path.
-
-##### Args:
-
-
-*  <b>`path`</b>: Folder to save model to.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.set_params(**params)` {#TensorFlowDNNRegressor.set_params}
-
-Set the parameters of this estimator.
-
-The method works on simple estimators as well as on nested objects
-(such as pipelines). The former have parameters of the form
-``<component>__<parameter>`` so that it's possible to update each
-component of a nested object.
-
-##### Args:
-
-
-*  <b>`**params`</b>: Parameters.
-
-##### Returns:
-
-  self
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If params contain invalid names.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.weights_` {#TensorFlowDNNRegressor.weights_}
-
-
-
-
-
 - - -
 
 ### `class tf.contrib.learn.TensorFlowEstimator` {#TensorFlowEstimator}
@@ -3479,7 +2953,7 @@ If you're a Google-internal user using command line flags with learn_runner.py
 probably want to use learn_runner.EstimatorConfig instead.
 - - -
 
-#### `tf.contrib.learn.RunConfig.__init__(master=None, task=None, num_ps_replicas=None, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, cluster_spec=None, tf_random_seed=None, save_summary_steps=100, save_checkpoints_secs=600, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000, job_name=None, is_chief=None)` {#RunConfig.__init__}
+#### `tf.contrib.learn.RunConfig.__init__(master=None, task=None, num_ps_replicas=None, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, cluster_spec=None, tf_random_seed=None, save_summary_steps=100, save_checkpoints_secs=600, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000, job_name=None, is_chief=None, evaluation_master='')` {#RunConfig.__init__}
 
 Constructor.
 
@@ -3548,6 +3022,7 @@ Example:
     must exist in the `cluster_spec.jobs`.
 *  <b>`is_chief`</b>: whether or not this task (as identified by the other parameters)
     should be the chief task.
+*  <b>`evaluation_master`</b>: the master on which to perform evaluation.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.TensorFlowDNNRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.TensorFlowDNNRegressor.md
deleted file mode 100644
index 20046f02451..00000000000
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.TensorFlowDNNRegressor.md
+++ /dev/null
@@ -1,258 +0,0 @@
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.__init__(*args, **kwargs)` {#TensorFlowDNNRegressor.__init__}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.bias_` {#TensorFlowDNNRegressor.bias_}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.config` {#TensorFlowDNNRegressor.config}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.dnn_bias_` {#TensorFlowDNNRegressor.dnn_bias_}
-
-Returns bias of deep neural network part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.dnn_weights_` {#TensorFlowDNNRegressor.dnn_weights_}
-
-Returns weights of deep neural network part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowDNNRegressor.evaluate}
-
-See `Evaluable`.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If at least one of `x` or `y` is provided, and at least one of
-      `input_fn` or `feed_fn` is provided.
-      Or if `metrics` is not `None` or `dict`.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.export(*args, **kwargs)` {#TensorFlowDNNRegressor.export}
-
-Exports inference graph into given dir. (deprecated arguments)
-
-SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23.
-Instructions for updating:
-The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. input_fn and input_feature_key will become required args, and use_deprecated_input_fn will default to False and be removed altogether.
-
-    Args:
-      export_dir: A string containing a directory to write the exported graph
-        and checkpoints.
-      input_fn: If `use_deprecated_input_fn` is true, then a function that given
-        `Tensor` of `Example` strings, parses it into features that are then
-        passed to the model. Otherwise, a function that takes no argument and
-        returns a tuple of (features, targets), where features is a dict of
-        string key to `Tensor` and targets is a `Tensor` that's currently not
-        used (and so can be `None`).
-      input_feature_key: Only used if `use_deprecated_input_fn` is false. String
-        key into the features dict returned by `input_fn` that corresponds to
-        the raw `Example` strings `Tensor` that the exported model will take as
-        input.
-      use_deprecated_input_fn: Determines the signature format of `input_fn`.
-      signature_fn: Function that returns a default signature and a named
-        signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s
-        for features and `Tensor` or `dict` of `Tensor`s for predictions.
-      default_batch_size: Default batch size of the `Example` placeholder.
-      exports_to_keep: Number of exports to keep.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.fit(x, y, steps=None, batch_size=None, monitors=None, logdir=None)` {#TensorFlowDNNRegressor.fit}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.get_params(deep=True)` {#TensorFlowDNNRegressor.get_params}
-
-Get parameters for this estimator.
-
-##### Args:
-
-
-*  <b>`deep`</b>: boolean, optional
-
-    If `True`, will return the parameters for this estimator and
-    contained subobjects that are estimators.
-
-##### Returns:
-
-  params : mapping of string to any
-  Parameter names mapped to their values.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.get_variable_names()` {#TensorFlowDNNRegressor.get_variable_names}
-
-Returns list of all variable names in this model.
-
-##### Returns:
-
-  List of names.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.get_variable_value(name)` {#TensorFlowDNNRegressor.get_variable_value}
-
-Returns value of the variable given by name.
-
-##### Args:
-
-
-*  <b>`name`</b>: string, name of the tensor.
-
-##### Returns:
-
-  Numpy array - value of the tensor.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.linear_bias_` {#TensorFlowDNNRegressor.linear_bias_}
-
-Returns bias of the linear part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.linear_weights_` {#TensorFlowDNNRegressor.linear_weights_}
-
-Returns weights per feature of the linear part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.model_dir` {#TensorFlowDNNRegressor.model_dir}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.partial_fit(x=None, y=None, input_fn=None, steps=1, batch_size=None, monitors=None)` {#TensorFlowDNNRegressor.partial_fit}
-
-Incremental fit on a batch of samples.
-
-This method is expected to be called several times consecutively
-on different or the same chunks of the dataset. This either can
-implement iterative training or out-of-core/online training.
-
-This is especially useful when the whole dataset is too big to
-fit in memory at the same time. Or when model is taking long time
-to converge, and you want to split up training into subparts.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-
-##### Returns:
-
-  `self`, for chaining.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If at least one of `x` and `y` is provided, and `input_fn` is
-      provided.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.predict(x=None, input_fn=None, batch_size=None, outputs=None, axis=1)` {#TensorFlowDNNRegressor.predict}
-
-Predict class or regression for `x`.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.predict_proba(x=None, input_fn=None, batch_size=None, outputs=None)` {#TensorFlowDNNRegressor.predict_proba}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.save(path)` {#TensorFlowDNNRegressor.save}
-
-Saves checkpoints and graph to given path.
-
-##### Args:
-
-
-*  <b>`path`</b>: Folder to save model to.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.set_params(**params)` {#TensorFlowDNNRegressor.set_params}
-
-Set the parameters of this estimator.
-
-The method works on simple estimators as well as on nested objects
-(such as pipelines). The former have parameters of the form
-``<component>__<parameter>`` so that it's possible to update each
-component of a nested object.
-
-##### Args:
-
-
-*  <b>`**params`</b>: Parameters.
-
-##### Returns:
-
-  self
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If params contain invalid names.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNRegressor.weights_` {#TensorFlowDNNRegressor.weights_}
-
-
-
-
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md
index 5e830ee139d..ea27fd27579 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.RunConfig.md
@@ -5,7 +5,7 @@ If you're a Google-internal user using command line flags with learn_runner.py
 probably want to use learn_runner.EstimatorConfig instead.
 - - -
 
-#### `tf.contrib.learn.RunConfig.__init__(master=None, task=None, num_ps_replicas=None, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, cluster_spec=None, tf_random_seed=None, save_summary_steps=100, save_checkpoints_secs=600, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000, job_name=None, is_chief=None)` {#RunConfig.__init__}
+#### `tf.contrib.learn.RunConfig.__init__(master=None, task=None, num_ps_replicas=None, num_cores=4, log_device_placement=False, gpu_memory_fraction=1, cluster_spec=None, tf_random_seed=None, save_summary_steps=100, save_checkpoints_secs=600, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000, job_name=None, is_chief=None, evaluation_master='')` {#RunConfig.__init__}
 
 Constructor.
 
@@ -74,6 +74,7 @@ Example:
     must exist in the `cluster_spec.jobs`.
 *  <b>`is_chief`</b>: whether or not this task (as identified by the other parameters)
     should be the chief task.
+*  <b>`evaluation_master`</b>: the master on which to perform evaluation.
 
 ##### Raises:
 
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.TensorFlowDNNClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.TensorFlowDNNClassifier.md
deleted file mode 100644
index 3f46aefb69a..00000000000
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.TensorFlowDNNClassifier.md
+++ /dev/null
@@ -1,258 +0,0 @@
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.__init__(*args, **kwargs)` {#TensorFlowDNNClassifier.__init__}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.bias_` {#TensorFlowDNNClassifier.bias_}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.config` {#TensorFlowDNNClassifier.config}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.dnn_bias_` {#TensorFlowDNNClassifier.dnn_bias_}
-
-Returns bias of deep neural network part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.dnn_weights_` {#TensorFlowDNNClassifier.dnn_weights_}
-
-Returns weights of deep neural network part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowDNNClassifier.evaluate}
-
-See `Evaluable`.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If at least one of `x` or `y` is provided, and at least one of
-      `input_fn` or `feed_fn` is provided.
-      Or if `metrics` is not `None` or `dict`.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.export(*args, **kwargs)` {#TensorFlowDNNClassifier.export}
-
-Exports inference graph into given dir. (deprecated arguments)
-
-SOME ARGUMENTS ARE DEPRECATED. They will be removed after 2016-09-23.
-Instructions for updating:
-The signature of the input_fn accepted by export is changing to be consistent with what's used by tf.Learn Estimator's train/evaluate. input_fn and input_feature_key will become required args, and use_deprecated_input_fn will default to False and be removed altogether.
-
-    Args:
-      export_dir: A string containing a directory to write the exported graph
-        and checkpoints.
-      input_fn: If `use_deprecated_input_fn` is true, then a function that given
-        `Tensor` of `Example` strings, parses it into features that are then
-        passed to the model. Otherwise, a function that takes no argument and
-        returns a tuple of (features, targets), where features is a dict of
-        string key to `Tensor` and targets is a `Tensor` that's currently not
-        used (and so can be `None`).
-      input_feature_key: Only used if `use_deprecated_input_fn` is false. String
-        key into the features dict returned by `input_fn` that corresponds to
-        the raw `Example` strings `Tensor` that the exported model will take as
-        input.
-      use_deprecated_input_fn: Determines the signature format of `input_fn`.
-      signature_fn: Function that returns a default signature and a named
-        signature map, given `Tensor` of `Example` strings, `dict` of `Tensor`s
-        for features and `Tensor` or `dict` of `Tensor`s for predictions.
-      default_batch_size: Default batch size of the `Example` placeholder.
-      exports_to_keep: Number of exports to keep.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.fit(x, y, steps=None, batch_size=None, monitors=None, logdir=None)` {#TensorFlowDNNClassifier.fit}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.get_params(deep=True)` {#TensorFlowDNNClassifier.get_params}
-
-Get parameters for this estimator.
-
-##### Args:
-
-
-*  <b>`deep`</b>: boolean, optional
-
-    If `True`, will return the parameters for this estimator and
-    contained subobjects that are estimators.
-
-##### Returns:
-
-  params : mapping of string to any
-  Parameter names mapped to their values.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.get_variable_names()` {#TensorFlowDNNClassifier.get_variable_names}
-
-Returns list of all variable names in this model.
-
-##### Returns:
-
-  List of names.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.get_variable_value(name)` {#TensorFlowDNNClassifier.get_variable_value}
-
-Returns value of the variable given by name.
-
-##### Args:
-
-
-*  <b>`name`</b>: string, name of the tensor.
-
-##### Returns:
-
-  Numpy array - value of the tensor.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.linear_bias_` {#TensorFlowDNNClassifier.linear_bias_}
-
-Returns bias of the linear part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.linear_weights_` {#TensorFlowDNNClassifier.linear_weights_}
-
-Returns weights per feature of the linear part.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.model_dir` {#TensorFlowDNNClassifier.model_dir}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.partial_fit(x=None, y=None, input_fn=None, steps=1, batch_size=None, monitors=None)` {#TensorFlowDNNClassifier.partial_fit}
-
-Incremental fit on a batch of samples.
-
-This method is expected to be called several times consecutively
-on different or the same chunks of the dataset. This either can
-implement iterative training or out-of-core/online training.
-
-This is especially useful when the whole dataset is too big to
-fit in memory at the same time. Or when model is taking long time
-to converge, and you want to split up training into subparts.
-
-##### Args:
-
-
-*  <b>`x`</b>: Matrix of shape [n_samples, n_features...]. Can be iterator that
-     returns arrays of features. The training input samples for fitting the
-     model. If set, `input_fn` must be `None`.
-*  <b>`y`</b>: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
-     iterator that returns array of targets. The training target values
-     (class labels in classification, real numbers in regression). If set,
-     `input_fn` must be `None`.
-*  <b>`input_fn`</b>: Input function. If set, `x`, `y`, and `batch_size` must be
-    `None`.
-*  <b>`steps`</b>: Number of steps for which to train model. If `None`, train forever.
-*  <b>`batch_size`</b>: minibatch size to use on the input, defaults to first
-    dimension of `x`. Must be `None` if `input_fn` is provided.
-*  <b>`monitors`</b>: List of `BaseMonitor` subclass instances. Used for callbacks
-    inside the training loop.
-
-##### Returns:
-
-  `self`, for chaining.
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If at least one of `x` and `y` is provided, and `input_fn` is
-      provided.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.predict(x=None, input_fn=None, batch_size=None, outputs=None, axis=1)` {#TensorFlowDNNClassifier.predict}
-
-Predict class or regression for `x`.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.predict_proba(x=None, input_fn=None, batch_size=None, outputs=None)` {#TensorFlowDNNClassifier.predict_proba}
-
-
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.save(path)` {#TensorFlowDNNClassifier.save}
-
-Saves checkpoints and graph to given path.
-
-##### Args:
-
-
-*  <b>`path`</b>: Folder to save model to.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.set_params(**params)` {#TensorFlowDNNClassifier.set_params}
-
-Set the parameters of this estimator.
-
-The method works on simple estimators as well as on nested objects
-(such as pipelines). The former have parameters of the form
-``<component>__<parameter>`` so that it's possible to update each
-component of a nested object.
-
-##### Args:
-
-
-*  <b>`**params`</b>: Parameters.
-
-##### Returns:
-
-  self
-
-##### Raises:
-
-
-*  <b>`ValueError`</b>: If params contain invalid names.
-
-
-- - -
-
-#### `tf.contrib.learn.TensorFlowDNNClassifier.weights_` {#TensorFlowDNNClassifier.weights_}
-
-
-
-
diff --git a/tensorflow/g3doc/api_docs/python/index.md b/tensorflow/g3doc/api_docs/python/index.md
index 3de9afccb5f..8a794418db2 100644
--- a/tensorflow/g3doc/api_docs/python/index.md
+++ b/tensorflow/g3doc/api_docs/python/index.md
@@ -854,8 +854,6 @@
   * [`run_feeds`](../../api_docs/python/contrib.learn.md#run_feeds)
   * [`run_n`](../../api_docs/python/contrib.learn.md#run_n)
   * [`RunConfig`](../../api_docs/python/contrib.learn.md#RunConfig)
-  * [`TensorFlowDNNClassifier`](../../api_docs/python/contrib.learn.md#TensorFlowDNNClassifier)
-  * [`TensorFlowDNNRegressor`](../../api_docs/python/contrib.learn.md#TensorFlowDNNRegressor)
   * [`TensorFlowEstimator`](../../api_docs/python/contrib.learn.md#TensorFlowEstimator)
   * [`TensorFlowRNNClassifier`](../../api_docs/python/contrib.learn.md#TensorFlowRNNClassifier)
   * [`TensorFlowRNNRegressor`](../../api_docs/python/contrib.learn.md#TensorFlowRNNRegressor)
diff --git a/tensorflow/g3doc/contrib/learn/get_started/index.md b/tensorflow/g3doc/contrib/learn/get_started/index.md
index f34c3456cf7..c99dbe962e5 100644
--- a/tensorflow/g3doc/contrib/learn/get_started/index.md
+++ b/tensorflow/g3doc/contrib/learn/get_started/index.md
@@ -44,8 +44,10 @@ Example of 3 layer network with 10, 20 and 10 hidden units respectively:
     from sklearn import datasets, metrics
 
     iris = datasets.load_iris()
-    classifier = learn.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], n_classes=3)
-    classifier.fit(iris.data, iris.target)
+    feature_columns = learn.infer_real_valued_columns_from_input(iris.data)
+    classifier = learn.DNNClassifier(
+        feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)
+    classifier.fit(iris.data, iris.target, steps=100)
     score = metrics.accuracy_score(iris.target, classifier.predict(iris.data))
     print("Accuracy: %f" % score)
 
diff --git a/tensorflow/g3doc/tutorials/seq2seq/index.md b/tensorflow/g3doc/tutorials/seq2seq/index.md
index c1673f474d5..10f39553fd0 100644
--- a/tensorflow/g3doc/tutorials/seq2seq/index.md
+++ b/tensorflow/g3doc/tutorials/seq2seq/index.md
@@ -126,7 +126,7 @@ In the above invocation, we set `feed_previous` to False. This means that the
 decoder will use `decoder_inputs` tensors as provided. If we set `feed_previous`
 to True, the decoder would only use the first element of `decoder_inputs`.
 All other tensors from this list would be ignored, and instead the previous
-output of the encoder would be used. This is used for decoding translations
+output of the decoder would be used. This is used for decoding translations
 in our translation model, but it can also be used during training, to make
 the model more robust to its own mistakes, similar
 to [Bengio et al., 2015](http://arxiv.org/abs/1506.03099)
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 974d3c24573..72a149330f8 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -245,4 +245,8 @@ __all__.extend([
     'train',
 ])
 
-__all__.append('__version__')
+__all__.extend([
+    '__version__',
+    '__git_version__',
+    '__compiler_version__',
+])
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 87391fff68d..8afd3f77f66 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -35,6 +35,12 @@ tensorflow::ImportNumpy();
 %constant int GRAPH_DEF_VERSION_MIN_CONSUMER = TF_GRAPH_DEF_VERSION_MIN_CONSUMER;
 %constant int GRAPH_DEF_VERSION_MIN_PRODUCER = TF_GRAPH_DEF_VERSION_MIN_PRODUCER;
 
+// Git version information
+%constant const char* __git_version__ = tf_git_version();
+
+// Compiler
+%constant const char* __compiler_version__ = tf_compiler_version();
+
 // Release the Python GIL for the duration of most methods.
 %exception {
   Py_BEGIN_ALLOW_THREADS;
diff --git a/tensorflow/python/debug/debug_data.py b/tensorflow/python/debug/debug_data.py
index 5eda336105b..4c4ef461abf 100644
--- a/tensorflow/python/debug/debug_data.py
+++ b/tensorflow/python/debug/debug_data.py
@@ -35,12 +35,20 @@ def load_tensor_from_event_file(event_file_path):
     event_file_path: Path to the event file.
 
   Returns:
-    The tensor value loaded from the event file.
+    The tensor value loaded from the event file. For uninitialized tensors,
+    returns None.
   """
   event = event_pb2.Event()
   with open(event_file_path, "rb") as f:
     event.ParseFromString(f.read())
-    tensor_value = tensor_util.MakeNdarray(event.summary.value[0].tensor)
+
+    if (event.summary.value[0].tensor.tensor_content or
+        event.summary.value[0].tensor.string_val):
+      # Initialized tensor.
+      tensor_value = tensor_util.MakeNdarray(event.summary.value[0].tensor)
+    else:
+      # Uninitialized tensor.
+      tensor_value = None
 
   return tensor_value
 
diff --git a/tensorflow/python/debug/session_debug_test.py b/tensorflow/python/debug/session_debug_test.py
index 0fd1146c878..de75108a3eb 100644
--- a/tensorflow/python/debug/session_debug_test.py
+++ b/tensorflow/python/debug/session_debug_test.py
@@ -179,6 +179,56 @@ class SessionDebugTest(test_util.TensorFlowTestCase):
           dump.get_rel_timestamps("%s/read" % str2_name, 0, "DebugIdentity")[0],
           0)
 
+  def testDumpUninitializedVariable(self):
+    op_namespace = "testDumpUninitializedVariable"
+    with session.Session() as sess:
+      u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
+      s_init_val = b"str1"
+
+      u_name = "%s/u" % op_namespace
+      s_name = "%s/s" % op_namespace
+
+      u_init = constant_op.constant(u_init_val, shape=[2, 2])
+      u = variables.Variable(u_init, name=u_name)
+      s_init = constant_op.constant(s_init_val)
+      s = variables.Variable(s_init, name=s_name)
+
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      debug_url = "file://%s" % self._dump_root
+
+      # Add debug tensor watches for u and s.
+      self._addDebugTensorWatch(
+          run_options, "%s" % u_name, 0, debug_urls=[debug_url])
+      self._addDebugTensorWatch(
+          run_options, "%s" % s_name, 0, debug_urls=[debug_url])
+
+      run_metadata = config_pb2.RunMetadata()
+
+      # Initialize u and s.
+      sess.run(variables.initialize_all_variables(),
+               options=run_options,
+               run_metadata=run_metadata)
+
+      # Verify the dump file for the uninitialized value of u.
+      dump = debug_data.DebugDumpDir(
+          self._dump_root, partition_graphs=run_metadata.partition_graphs)
+
+      self.assertEqual(2, dump.size)
+      self.assertEqual(self._expected_partition_graph_count,
+                       len(run_metadata.partition_graphs))
+
+      # Verify that the dumped values of u and s are None (uninitialized).
+      u_vals = dump.get_tensors(u_name, 0, "DebugIdentity")
+      s_vals = dump.get_tensors(s_name, 0, "DebugIdentity")
+      self.assertEqual(1, len(u_vals))
+      self.assertIsNone(u_vals[0])
+      self.assertEqual(1, len(s_vals))
+      self.assertIsNone(s_vals[0])
+
+      # Call run() again, to check that u and s are initialized properly.
+      self.assertAllClose(u_init_val, sess.run(u))
+      self.assertEqual(s_init_val, sess.run(s))
+
   def testDumpToFileWhileLoop(self):
     with session.Session() as sess:
       num_iter = 10
diff --git a/tensorflow/python/framework/versions.py b/tensorflow/python/framework/versions.py
index 369dd701cf6..63722e041cf 100644
--- a/tensorflow/python/framework/versions.py
+++ b/tensorflow/python/framework/versions.py
@@ -22,6 +22,9 @@ from __future__ import print_function
 from tensorflow.python import pywrap_tensorflow
 
 __version__ = pywrap_tensorflow.__version__
+__git_version__ = pywrap_tensorflow.__git_version__
+__compiler_version__ = pywrap_tensorflow.__compiler_version__
+
 GRAPH_DEF_VERSION = pywrap_tensorflow.GRAPH_DEF_VERSION
 GRAPH_DEF_VERSION_MIN_CONSUMER = (
     pywrap_tensorflow.GRAPH_DEF_VERSION_MIN_CONSUMER)
@@ -30,4 +33,5 @@ GRAPH_DEF_VERSION_MIN_PRODUCER = (
 
 # Make sure these symbols are exported even though one starts with _.
 __all__ = ["__version__", "GRAPH_DEF_VERSION", "GRAPH_DEF_VERSION_MIN_CONSUMER",
-           "GRAPH_DEF_VERSION_MIN_PRODUCER"]
+           "GRAPH_DEF_VERSION_MIN_PRODUCER", "__git_version__",
+           "__compiler_version__"]
diff --git a/tensorflow/python/framework/versions_test.py b/tensorflow/python/framework/versions_test.py
index 6042048414f..025c311a2a1 100644
--- a/tensorflow/python/framework/versions_test.py
+++ b/tensorflow/python/framework/versions_test.py
@@ -38,6 +38,9 @@ class VersionTest(tf.test.TestCase):
     self.assertLessEqual(0, min_producer)
     self.assertLessEqual(min_producer, version)
 
+  def testGitVersion(self):
+    self.assertEqual(type(tf.__git_version__), str)
+    self.assertEqual(type(tf.__compiler_version__), str)
 
 if __name__ == "__main__":
   tf.test.main()
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index bce249d8c5e..e48ed569118 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -605,6 +605,27 @@ class VariableScopeTest(tf.test.TestCase):
           with tf.name_scope("scope2") as sc2:
             self.assertEqual(sc2, "outer_1/default/scope2/")
 
+  def testGetLocalVar(self):
+    with self.test_session():
+      # Check that local variable respects naming.
+      with tf.variable_scope("outer") as outer:
+        with tf.variable_scope(outer, "default", []):
+          local_var = variable_scope.get_local_variable(
+              "w", [], collections=["foo"])
+          self.assertEqual(local_var.name, "outer/w:0")
+
+      # Since variable is local, it should be in the local variable collection
+      # but not the trainable collection.
+      self.assertIn(local_var, tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES))
+      self.assertIn(local_var, tf.get_collection("foo"))
+      self.assertNotIn(
+          local_var, tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
+
+      # Check that local variable respects `reuse`.
+      with tf.variable_scope(outer, "default", reuse=True):
+        self.assertEqual(variable_scope.get_local_variable("w", []).name,
+                         "outer/w:0")
+
 
 def axis0_into1_partitioner(shape=None, **unused_kwargs):
   part = [1] * len(shape)
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index bd5d9b1b447..7afe9e70abf 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -1977,7 +1977,7 @@ class WhileContext(ControlFlowContext):
     else:
       values_shape = array_ops.shape_internal(op.inputs[0], optimize=False)[1:]
       values_shape = array_ops.concat(0, [[1], values_shape])
-      values_acc = array_ops.zeros(values_shape)
+      values_acc = array_ops.zeros(values_shape, dtype=values.dtype)
     indices_acc = constant_op.constant([0], indices.dtype)
     shape_acc = None
     if dense_shape is not None:
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index 23813c4001d..f4352fcdb58 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework.test_util import TensorFlowTestCase
 from tensorflow.python.ops import control_flow_ops
@@ -172,55 +173,57 @@ class SwitchTestCase(TensorFlowTestCase):
         self.assertAllEqual(*sess.run([static_grads, dynamic_grads]))
 
   def testIndexedSlicesWithShapeGradientInWhileLoop(self):
-    with self.test_session() as sess:
-      num_steps = 9
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session() as sess:
+        num_steps = 9
 
-      inputs = tf.placeholder(dtype="float32", shape=[num_steps])
-      initial_outputs = tf.TensorArray(dtype="float32", size=num_steps)
-      initial_i = tf.constant(0, dtype="int32")
+        inputs = tf.placeholder(dtype=dtype, shape=[num_steps])
+        initial_outputs = tf.TensorArray(dtype=dtype, size=num_steps)
+        initial_i = tf.constant(0, dtype=dtypes.int32)
 
-      def Cond(i, _):
-        return i < num_steps
+        def Cond(i, _):
+          return i < num_steps  # pylint: disable=cell-var-from-loop
 
-      def Body(i, outputs):
-        x = tf.gather(inputs, i)
-        outputs = outputs.write(i, x)
-        return i + 1, outputs
+        def Body(i, outputs):
+          x = tf.gather(inputs, i)  # pylint: disable=cell-var-from-loop
+          outputs = outputs.write(i, x)
+          return i + 1, outputs
 
-      _, outputs = tf.while_loop(Cond, Body, [initial_i, initial_outputs])
+        _, outputs = tf.while_loop(Cond, Body, [initial_i, initial_outputs])
 
-      outputs = tf.reduce_sum(outputs.pack())
-      r = tf.gradients([outputs], [inputs])[0]
-      grad_wr_inputs = ops.convert_to_tensor(r)
-      o, grad = sess.run([outputs, grad_wr_inputs],
-                         feed_dict={inputs: [4, 6, 0, 7, 0, 0, 1, 2, 0]})
-      self.assertEquals(o, 20)
-      self.assertAllEqual(grad, [1] * num_steps)
+        outputs = tf.reduce_sum(outputs.pack())
+        r = tf.gradients([outputs], [inputs])[0]
+        grad_wr_inputs = ops.convert_to_tensor(r)
+        o, grad = sess.run([outputs, grad_wr_inputs],
+                           feed_dict={inputs: [4, 6, 0, 7, 0, 0, 1, 2, 0]})
+        self.assertEquals(o, 20)
+        self.assertAllEqual(grad, [1] * num_steps)
 
   def testIndexedSlicesWithDynamicShapeGradientInWhileLoop(self):
-    with self.test_session() as sess:
-      inputs = tf.placeholder(dtype="float32")
-      initial_outputs = tf.TensorArray(dtype="float32", dynamic_size=True,
-                                       size=1)
-      initial_i = tf.constant(0, dtype="int32")
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session() as sess:
+        inputs = tf.placeholder(dtype=dtype)
+        initial_outputs = tf.TensorArray(dtype=dtype, dynamic_size=True,
+                                         size=1)
+        initial_i = tf.constant(0, dtype=dtypes.int32)
 
-      def Cond(i, _):
-        return i < tf.size(inputs)
+        def Cond(i, _):
+          return i < tf.size(inputs)  # pylint: disable=cell-var-from-loop
 
-      def Body(i, outputs):
-        x = tf.gather(inputs, i)
-        outputs = outputs.write(i, x)
-        return i + 1, outputs
+        def Body(i, outputs):
+          x = tf.gather(inputs, i)  # pylint: disable=cell-var-from-loop
+          outputs = outputs.write(i, x)
+          return i + 1, outputs
 
-      _, outputs = tf.while_loop(Cond, Body, [initial_i, initial_outputs])
+        _, outputs = tf.while_loop(Cond, Body, [initial_i, initial_outputs])
 
-      outputs = tf.reduce_sum(outputs.pack())
-      r = tf.gradients([outputs], [inputs])[0]
-      grad_wr_inputs = ops.convert_to_tensor(r)
-      o, grad = sess.run([outputs, grad_wr_inputs],
-                         feed_dict={inputs: [1, 3, 2]})
-      self.assertEquals(o, 6)
-      self.assertAllEqual(grad, [1] * 3)
+        outputs = tf.reduce_sum(outputs.pack())
+        r = tf.gradients([outputs], [inputs])[0]
+        grad_wr_inputs = ops.convert_to_tensor(r)
+        o, grad = sess.run([outputs, grad_wr_inputs],
+                           feed_dict={inputs: [1, 3, 2]})
+        self.assertEquals(o, 6)
+        self.assertAllEqual(grad, [1] * 3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py
index 7990dba3b63..df34de3da41 100644
--- a/tensorflow/python/ops/io_ops.py
+++ b/tensorflow/python/ops/io_ops.py
@@ -138,7 +138,6 @@ from __future__ import print_function
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.ops import gen_io_ops
 # go/tf-wildcard-import
@@ -205,80 +204,12 @@ def _restore_slice(file_pattern, tensor_name, shape_and_slice, tensor_type,
       preferred_shard, name=name)
 
 
-@ops.RegisterShape("Restore")
-def _RestoreShape(op):
-  """Shape function for Restore op."""
-  # Validate input shapes.
-  unused_file_pattern = op.inputs[0].get_shape().merge_with(
-      tensor_shape.scalar())
-  unused_tensor_name = op.inputs[1].get_shape().merge_with(
-      tensor_shape.scalar())
-  return [tensor_shape.unknown_shape()]
-
-
-@ops.RegisterShape("RestoreSlice")
-def _RestoreSliceShape(op):
-  """Shape function for RestoreSlice op."""
-  # Validate input shapes.
-  unused_file_pattern = op.inputs[0].get_shape().merge_with(
-      tensor_shape.scalar())
-  unused_tensor_name = op.inputs[1].get_shape().merge_with(
-      tensor_shape.scalar())
-  unused_shape_and_slice_shape = op.inputs[2].get_shape().merge_with(
-      tensor_shape.scalar())
-  # TODO(mrry): Attempt to parse the shape_and_slice value and use it
-  # to form the shape of the output.
-  return [tensor_shape.unknown_shape()]
-
-
-@ops.RegisterShape("Save")
-def _SaveShape(op):
-  """Shape function for Save op."""
-  # Validate input shapes.
-  unused_filename = op.inputs[0].get_shape().merge_with(tensor_shape.scalar())
-  data_count = len(op.inputs) - 2
-  unused_tensor_names_shape = op.inputs[1].get_shape().merge_with(
-      tensor_shape.vector(data_count))
-  return []
-
-
-@ops.RegisterShape("SaveSlices")
-def _SaveSlicesShape(op):
-  """Shape function for SaveSlices op."""
-  # Validate input shapes.
-  unused_filename = op.inputs[0].get_shape().merge_with(tensor_shape.scalar())
-  data_count = len(op.inputs) - 3
-  unused_tensor_names_shape = op.inputs[1].get_shape().merge_with(
-      tensor_shape.vector(data_count))
-  unused_shapes_and_slices_shape = op.inputs[2].get_shape().merge_with(
-      tensor_shape.vector(data_count))
-  # TODO(mrry): Attempt to parse the shapes_and_slices values and use
-  # them to constrain the shape of the remaining inputs.
-  return []
-
-
-@ops.RegisterShape("ShardedFilename")
-def _ShardedFilenameShape(op):
-  """Shape function for ShardedFilename op."""
-  # Validate input shapes.
-  unused_basename_shape = op.inputs[0].get_shape().merge_with(
-      tensor_shape.scalar())
-  unused_shard_shape = op.inputs[1].get_shape().merge_with(
-      tensor_shape.scalar())
-  unused_num_shards_shape = op.inputs[2].get_shape().merge_with(
-      tensor_shape.scalar())
-  return [tensor_shape.scalar()]
-
-
-@ops.RegisterShape("ShardedFilespec")
-def _ShardedFilespecShape(op):
-  """Shape function for ShardedFilespec op."""
-  # Validate input shapes.
-  unused_basename_shape = op.inputs[0].get_shape().merge_with(
-      tensor_shape.scalar())
-  unused_num_shards_shape = op.inputs[1].get_shape().merge_with(
-      tensor_shape.scalar())
-  return [tensor_shape.scalar()]
+ops.RegisterShape("Restore")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("RestoreSlice")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("Save")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("SaveSlices")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ShardedFilename")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ShardedFilespec")(common_shapes.call_cpp_shape_fn)
 
 
 class ReaderBase(object):
@@ -574,61 +505,13 @@ ops.RegisterShape("TextLineReader")(common_shapes.scalar_shape)
 ops.RegisterShape("WholeFileReader")(common_shapes.scalar_shape)
 ops.RegisterShape("TFRecordReader")(common_shapes.scalar_shape)
 
-
-@ops.RegisterShape("ReaderNumRecordsProduced")
-@ops.RegisterShape("ReaderNumWorkUnitsCompleted")
-@ops.RegisterShape("ReaderSerializeState")
-def _ReaderScalarShape(op):
-  """Shape function for ops that transform a reader to a scalar."""
-  unused_handle_shape = op.inputs[0].get_shape().merge_with(
-      tensor_shape.scalar())
-  return [tensor_shape.scalar()]
-
-
-@ops.RegisterShape("ReaderRead")
-def _ReaderReadShape(op):
-  """Shape function for the ReaderBase.Read op."""
-  unused_handle_shape = op.inputs[0].get_shape().merge_with(
-      tensor_shape.scalar())
-  unused_queue_shape = op.inputs[1].get_shape().merge_with(
-      tensor_shape.scalar())
-  return [tensor_shape.scalar(), tensor_shape.scalar()]
-
-
-@ops.RegisterShape("ReaderReadUpTo")
-def _ReaderReadUpToShape(_):
-  """Shape function for the ReaderBase.ReadUpTo op."""
-  return [tensor_shape.unknown_shape(ndims=1),
-          tensor_shape.unknown_shape(ndims=1)]
-
-
-@ops.RegisterShape("ReaderReset")
-def _ReaderResetShape(op):
-  """Shape function for the ReaderBase.Reset op."""
-  unused_handle_shape = op.inputs[0].get_shape().merge_with(
-      tensor_shape.scalar())
-  return []
-
-
-@ops.RegisterShape("ReaderRestoreState")
-def _ReaderRestoreStateShape(op):
-  """Shape function for the ReaderBase.Restore op."""
-  unused_handle_shape = op.inputs[0].get_shape().merge_with(
-      tensor_shape.scalar())
-  unused_state_shape = op.inputs[1].get_shape().merge_with(
-      tensor_shape.scalar())
-  return []
-
-
-@ops.RegisterShape("ReadFile")
-def _ReadFileShape(op):
-  """Shape function for the ReadFile op."""
-  return [op.inputs[0].get_shape().merge_with(tensor_shape.scalar())]
-
-
-@ops.RegisterShape("MatchingFiles")
-def _MatchingFilesShape(op):
-  """Shape function for the MatchingFiles op."""
-  unused_patern_shape = op.inputs[0].get_shape().merge_with(
-      tensor_shape.scalar())
-  return [tensor_shape.unknown_shape(ndims=1)]
+ops.RegisterShape("ReaderNumRecordsProduced")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ReaderNumWorkUnitsCompleted")(
+    common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ReaderSerializeState")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ReaderRead")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ReaderReadUpTo")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ReaderReset")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ReaderRestoreState")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ReadFile")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("MatchingFiles")(common_shapes.call_cpp_shape_fn)
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 217556df3c7..e94ff063b7b 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import collections as collections_lib
 import contextlib
+import functools
 import traceback
 
 import six
@@ -35,8 +36,8 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 
 __all__ = ["VariableScope", "get_variable_scope",
-           "get_variable", "variable_scope", "variable_op_scope",
-           "no_regularizer"]
+           "get_variable", "get_local_variable", "variable_scope",
+           "variable_op_scope", "no_regularizer"]
 
 
 class _PartitionInfo(object):
@@ -1012,6 +1013,19 @@ def get_variable(name,
       custom_getter=custom_getter)
 
 
+@functools.wraps(get_variable)
+def get_local_variable(*args, **kwargs):
+  kwargs["trainable"] = False  # Local variables are never trainable.
+  # Build a new list: `+=` would mutate the caller's list and fail on None.
+  kwargs["collections"] = list(kwargs.get("collections") or []) + [
+      ops.GraphKeys.LOCAL_VARIABLES]
+  return get_variable(*args, **kwargs)
+# Patch the docstring once, at import time; patching it inside the function
+# prepended the summary again on every call.
+get_local_variable.__doc__ = ("Gets an existing local variable or creates a "
+                              "new one.\n\n" + (get_variable.__doc__ or ""))
+
+
 def _get_partitioned_variable(name,
                               shape=None,
                               dtype=None,
diff --git a/tensorflow/python/summary/event_accumulator.py b/tensorflow/python/summary/event_accumulator.py
index 5c5ce00508f..1617924357f 100644
--- a/tensorflow/python/summary/event_accumulator.py
+++ b/tensorflow/python/summary/event_accumulator.py
@@ -21,6 +21,8 @@ import collections
 import os.path
 import threading
 
+import numpy as np
+
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf.config_pb2 import RunMetadata
 from tensorflow.core.util.event_pb2 import SessionLog
@@ -438,110 +440,23 @@ class EventAccumulator(object):
       self.most_recent_step = event.step
       self.most_recent_wall_time = event.wall_time
 
-  def _Percentile(self, compression_bps, bucket_limit, cumsum_weights,
-                  histo_min, histo_max, histo_num):
-    """Linearly interpolates a histogram weight for a particular basis point.
-
-    Uses clamping methods on `histo_min` and `histo_max` to produce tight
-    linear estimates of the histogram weight at a particular basis point.
-
-    Args:
-      compression_bps: The desired basis point at which to estimate the weight
-      bucket_limit: An array of the RHS histogram bucket limits
-      cumsum_weights: A cumulative sum of the fraction of weights in each
-        histogram bucket, represented in basis points.
-      histo_min: The minimum weight observed in the weight histogram
-      histo_max: The maximum weight observed in the weight histogram
-      histo_num: The number of items in the weight histogram
-
-    Returns:
-      A linearly interpolated value of the histogram weight estimate.
-    """
-    if histo_num == 0:
-      return 0
-
-    for i, cumsum in enumerate(cumsum_weights):
-      if cumsum >= compression_bps:
-        cumsum_prev = cumsum_weights[i - 1] if i > 0 else 0
-        # Prevent cumsum = 0, cumsum_prev = 0, lerp divide by zero.
-        if cumsum == cumsum_prev:
-          continue
-
-        # Calculate the lower bound of interpolation
-        lhs = bucket_limit[i - 1] if (i > 0 and cumsum_prev > 0) else histo_min
-        lhs = max(lhs, histo_min)
-
-        # Calculate the upper bound of interpolation
-        rhs = bucket_limit[i]
-        rhs = min(rhs, histo_max)
-
-        weight = _Remap(compression_bps, cumsum_prev, cumsum, lhs, rhs)
-        return weight
-
-    ## We have not exceeded cumsum, so return the max observed.
-    return histo_max
-
-  def _ProcessCompressedHistogram(self, tag, wall_time, step, histo):
-    """Processes a histogram by adding a compression to accumulated state.
-
-    Adds a compressed histogram by linearly interpolating histogram buckets to
-    represent the histogram weight at multiple compression points. Uses
-    self._compression_bps (passed to EventAccumulator constructor) as the
-    compression points (represented in basis points, 1/100ths of a precent).
-
-    Args:
-      tag: A string name of the tag for which histograms are retrieved.
-      wall_time: Time in seconds since epoch
-      step: Number of steps that have passed
-      histo: proto2 histogram Object
-    """
-
-    def _CumulativeSum(arr):
-      return [sum(arr[:i + 1]) for i in range(len(arr))]
-
-    # Convert from proto repeated field into a Python list.
-    bucket = list(histo.bucket)
-    bucket_limit = list(histo.bucket_limit)
-
-    bucket_total = sum(bucket)
-    if bucket_total == 0:
-      bucket_total = 1
-    fraction_weights = [10000 * x / bucket_total for x in bucket]
-    cumsum_weights = _CumulativeSum(fraction_weights)
-
-    percentiles = [
-        self._Percentile(bps, bucket_limit, cumsum_weights, histo.min,
-                         histo.max, histo.num) for bps in self._compression_bps
-    ]
-
-    compressed_histogram_values = [CompressedHistogramValue(
-        basis_point=bps,
-        value=value) for bps, value in zip(self._compression_bps, percentiles)]
-    histogram_event = CompressedHistogramEvent(
-        wall_time=wall_time,
-        step=step,
-        compressed_histogram_values=compressed_histogram_values)
-
-    self._compressed_histograms.AddItem(tag, histogram_event)
+  def _ConvertHistogramProtoToTuple(self, histo):
+    return HistogramValue(min=histo.min,
+                          max=histo.max,
+                          num=histo.num,
+                          sum=histo.sum,
+                          sum_squares=histo.sum_squares,
+                          bucket_limit=list(histo.bucket_limit),
+                          bucket=list(histo.bucket))
 
   def _ProcessHistogram(self, tag, wall_time, step, histo):
-    """Processes a histogram by adding it to accumulated state."""
-
-    # Also process the compressed histogram
-    self._ProcessCompressedHistogram(tag, wall_time, step, histo)
-
-    histogram_value = HistogramValue(min=histo.min,
-                                     max=histo.max,
-                                     num=histo.num,
-                                     sum=histo.sum,
-                                     sum_squares=histo.sum_squares,
-                                     # Convert from proto repeated to list.
-                                     bucket_limit=list(histo.bucket_limit),
-                                     bucket=list(histo.bucket),)
-    histogram_event = HistogramEvent(wall_time=wall_time,
-                                     step=step,
-                                     histogram_value=histogram_value,)
-    self._histograms.AddItem(tag, histogram_event)
+    """Processes a proto histogram by adding it to accumulated state."""
+    histo = self._ConvertHistogramProtoToTuple(histo)
+    self._histograms.AddItem(tag, HistogramEvent(wall_time, step, histo))
+    self._compressed_histograms.AddItem(
+        tag,
+        CompressedHistogramEvent(
+            wall_time, step, _CompressHistogram(histo, self._compression_bps)))
 
   def _ProcessImage(self, tag, wall_time, step, image):
     """Processes an image by adding it to accumulated state."""
@@ -658,6 +573,55 @@ def _ParseFileVersion(file_version):
     return -1
 
 
+def _CompressHistogram(histo, bps):
+  """Creates fixed size histogram by adding compression to accumulated state.
+
+  This routine transforms a histogram at a particular step by linearly
+  interpolating its variable number of buckets to represent their cumulative
+  weight at a constant number of compression points. This significantly reduces
+  the size of the histogram and makes it suitable for a two-dimensional area
+  plot where the output of this routine constitutes the ranges for a single x
+  coordinate.
+
+  Args:
+    histo: A HistogramValue namedtuple.
+    bps: Basis points (1/100ths of a percent), assumed ascending; last = 100%.
+
+  Returns:
+    List of CompressedHistogramValue namedtuples.
+  """
+  # See also: Histogram::Percentile() in core/lib/histogram/histogram.cc
+  if not histo.num:  # empty histogram: every compression point maps to 0.0
+    return [CompressedHistogramValue(b, 0.0) for b in bps]
+  bucket = np.array(histo.bucket)
+  weights = (bucket * bps[-1] / (bucket.sum() or 1.0)).cumsum()
+  values = []
+  j = 0  # index of the next compression point to resolve
+  while j < len(bps):
+    i = np.searchsorted(weights, bps[j], side='right')  # first weight > bps[j]
+    while i < len(weights):
+      cumsum = weights[i]
+      cumsum_prev = weights[i - 1] if i > 0 else 0.0
+      if cumsum == cumsum_prev:  # prevent remap divide by zero
+        i += 1
+        continue
+      if not i or not cumsum_prev:  # no weight below this bucket
+        lhs = histo.min
+      else:
+        lhs = max(histo.bucket_limit[i - 1], histo.min)
+      rhs = min(histo.bucket_limit[i], histo.max)
+      weight = _Remap(bps[j], cumsum_prev, cumsum, lhs, rhs)
+      values.append(CompressedHistogramValue(bps[j], weight))
+      j += 1
+      break
+    else:  # ran out of buckets without finding one for bps[j]
+      break
+  while j < len(bps):  # remaining points saturate at histo.max
+    values.append(CompressedHistogramValue(bps[j], histo.max))
+    j += 1
+  return values
+
+
 def _Remap(x, x0, x1, y0, y1):
   """Linearly map from [x0, x1] unto [y0, y1]."""
   return y0 + (x - x0) * float(y1 - y0) / (x1 - x0)
diff --git a/tensorflow/python/summary/event_accumulator_test.py b/tensorflow/python/summary/event_accumulator_test.py
index 3349e73f04e..a8db2c50e9b 100644
--- a/tensorflow/python/summary/event_accumulator_test.py
+++ b/tensorflow/python/summary/event_accumulator_test.py
@@ -307,91 +307,38 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
         compressed_histogram_values=expected_vals2)
     self.assertEqual(acc.CompressedHistograms('hst2'), [expected_cmphst2])
 
-  def testPercentile(self):
-
-    def AssertExpectedForBps(bps, expected):
-      output = acc._Percentile(bps, bucket_limit, cumsum_weights, histo_min,
-                               histo_max, histo_num)
-      self.assertAlmostEqual(expected, output)
-
-    gen = _EventGenerator()
-    acc = ea.EventAccumulator(gen)
-
-    bucket_limit = [1, 2, 3, 4]
-    histo_num = 100
-
-    ## All weights in the first bucket
-    cumsum_weights = [10000, 10000, 10000, 10000]
-    histo_min = -1
-    histo_max = .9
-    AssertExpectedForBps(0, histo_min)
-    AssertExpectedForBps(2500, ea._Remap(2500, 0, 10000, histo_min, histo_max))
-    AssertExpectedForBps(5000, ea._Remap(5000, 0, 10000, histo_min, histo_max))
-    AssertExpectedForBps(7500, ea._Remap(7500, 0, 10000, histo_min, histo_max))
-    AssertExpectedForBps(10000, histo_max)
-
-    ## All weights in second bucket
-    cumsum_weights = [0, 10000, 10000, 10000]
-    histo_min = 1.1
-    histo_max = 1.8
-    AssertExpectedForBps(0, histo_min)
-    AssertExpectedForBps(2500, ea._Remap(2500, 0, 10000, histo_min, histo_max))
-    AssertExpectedForBps(5000, ea._Remap(5000, 0, 10000, histo_min, histo_max))
-    AssertExpectedForBps(7500, ea._Remap(7500, 0, 10000, histo_min, histo_max))
-    AssertExpectedForBps(10000, histo_max)
-
-    ## All weights in the last bucket
-    cumsum_weights = [0, 0, 0, 10000]
-    histo_min = 3.1
-    histo_max = 3.6
-    AssertExpectedForBps(0, histo_min)
-    AssertExpectedForBps(2500, ea._Remap(2500, 0, 10000, histo_min, histo_max))
-    AssertExpectedForBps(5000, ea._Remap(5000, 0, 10000, histo_min, histo_max))
-    AssertExpectedForBps(7500, ea._Remap(7500, 0, 10000, histo_min, histo_max))
-    AssertExpectedForBps(10000, histo_max)
-
-    ## Weights distributed between two buckets
-    cumsum_weights = [0, 4000, 10000, 10000]
-    histo_min = 1.1
-    histo_max = 2.9
-    AssertExpectedForBps(0, histo_min)
-    AssertExpectedForBps(2500, ea._Remap(2500, 0, 4000, histo_min,
-                                         bucket_limit[1]))
-    AssertExpectedForBps(5000, ea._Remap(5000, 4000, 10000, bucket_limit[1],
-                                         histo_max))
-    AssertExpectedForBps(7500, ea._Remap(7500, 4000, 10000, bucket_limit[1],
-                                         histo_max))
-    AssertExpectedForBps(10000, histo_max)
-
-    ## Weights distributed between all buckets
-    cumsum_weights = [1000, 4000, 8000, 10000]
-    histo_min = -1
-    histo_max = 3.9
-    AssertExpectedForBps(0, histo_min)
-    AssertExpectedForBps(2500, ea._Remap(2500, 1000, 4000, bucket_limit[0],
-                                         bucket_limit[1]))
-    AssertExpectedForBps(5000, ea._Remap(5000, 4000, 8000, bucket_limit[1],
-                                         bucket_limit[2]))
-    AssertExpectedForBps(7500, ea._Remap(7500, 4000, 8000, bucket_limit[1],
-                                         bucket_limit[2]))
-    AssertExpectedForBps(9000, ea._Remap(9000, 8000, 10000, bucket_limit[2],
-                                         histo_max))
-    AssertExpectedForBps(10000, histo_max)
-
-    ## Most weight in first bucket
-    cumsum_weights = [9000, 10000, 10000, 10000]
-    histo_min = -1
-    histo_max = 1.1
-    AssertExpectedForBps(0, histo_min)
-    AssertExpectedForBps(2500, ea._Remap(2500, 0, 9000, histo_min,
-                                         bucket_limit[0]))
-    AssertExpectedForBps(5000, ea._Remap(5000, 0, 9000, histo_min,
-                                         bucket_limit[0]))
-    AssertExpectedForBps(7500, ea._Remap(7500, 0, 9000, histo_min,
-                                         bucket_limit[0]))
-    AssertExpectedForBps(9500, ea._Remap(9500, 9000, 10000, bucket_limit[0],
-                                         histo_max))
-    AssertExpectedForBps(10000, histo_max)
+  def testCompressHistogram_uglyHistogram(self):
+    """Checks interpolation when nearly all weight sits in a ~1e-12 bucket."""
+    bps = (0, 668, 1587, 3085, 5000, 6915, 8413, 9332, 10000)
+    vals = ea._CompressHistogram(
+        ea.HistogramValue(
+            min=0.0,
+            max=1.0,
+            num=960.0,
+            sum=64.0,
+            sum_squares=64.0,
+            bucket_limit=[
+                0.0,
+                1e-12,
+                0.917246389039776,
+                1.0089710279437536,
+                1.7976931348623157e+308],
+            bucket=[0.0, 896.0, 0.0, 64.0, 0.0]),
+        bps)
+    self.assertEqual(tuple(v.basis_point for v in vals), bps)
+    self.assertAlmostEqual(vals[0].value, 0.0)
+    # The next seven expected values are ~1e-13, far below the default
+    # assertAlmostEqual tolerance of 7 decimal places, which would accept
+    # even 0.0 for them. An explicit delta well under their magnitude makes
+    # these assertions actually check the interpolation.
+    self.assertAlmostEqual(vals[1].value, 7.157142857142856e-14, delta=1e-20)
+    self.assertAlmostEqual(vals[2].value, 1.7003571428571426e-13, delta=1e-20)
+    self.assertAlmostEqual(vals[3].value, 3.305357142857143e-13, delta=1e-20)
+    self.assertAlmostEqual(vals[4].value, 5.357142857142857e-13, delta=1e-20)
+    self.assertAlmostEqual(vals[5].value, 7.408928571428571e-13, delta=1e-20)
+    self.assertAlmostEqual(vals[6].value, 9.013928571428571e-13, delta=1e-20)
+    self.assertAlmostEqual(vals[7].value, 9.998571428571429e-13, delta=1e-20)
+    self.assertAlmostEqual(vals[8].value, 1.0)
 
   def testImages(self):
     gen = _EventGenerator()
diff --git a/tensorflow/python/training/training_ops.py b/tensorflow/python/training/training_ops.py
index b1401e51759..fbb5bb32c1b 100644
--- a/tensorflow/python/training/training_ops.py
+++ b/tensorflow/python/training/training_ops.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.training import gen_training_ops
@@ -48,246 +49,23 @@ def _AssertInputIsScalar(op, index):
   op.inputs[index].get_shape().assert_is_compatible_with(tensor_shape.scalar())
 
 
-@ops.RegisterShape("ApplyAdadelta")
-def _ApplyAdadeltaShape(op):
-  """Shape function for the ApplyAdadelta op."""
-  var_shape = op.inputs[0].get_shape()
-  accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  accum_update_shape = op.inputs[2].get_shape().merge_with(var_shape)
-  _AssertInputIsScalar(op, 3)  # lr
-  _AssertInputIsScalar(op, 4)  # rho
-  _AssertInputIsScalar(op, 5)  # epsilon
-  grad_shape = op.inputs[6].get_shape().merge_with(accum_shape)
-  return [grad_shape]
-
-@ops.RegisterShape("ApplyAdagrad")
-def _ApplyAdagradShape(op):
-  """Shape function for the ApplyAdagrad op."""
-  var_shape = op.inputs[0].get_shape()
-  accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  _AssertInputIsScalar(op, 2)  # lr
-  grad_shape = op.inputs[3].get_shape().merge_with(accum_shape)
-  return [grad_shape]
-
-@ops.RegisterShape("ApplyProximalAdagrad")
-def _ApplyProximalAdagradShape(op):
-  """Shape function for the ApplyProximalAdagrad op."""
-  var_shape = op.inputs[0].get_shape()
-  accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  _AssertInputIsScalar(op, 2)  # lr
-  _AssertInputIsScalar(op, 3)  # l1
-  _AssertInputIsScalar(op, 4)  # l2
-  grad_shape = op.inputs[5].get_shape().merge_with(accum_shape)
-  return [grad_shape]
-
-
-@ops.RegisterShape("ApplyFtrl")
-def _ApplyFtrlShape(op):
-  """Shape function for the ApplyFtrlOp op."""
-  var_shape = op.inputs[0].get_shape()
-  accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  linear_shape = op.inputs[2].get_shape().merge_with(accum_shape)
-  grad_shape = op.inputs[3].get_shape().merge_with(linear_shape)
-  _AssertInputIsScalar(op, 4)  # lr
-  _AssertInputIsScalar(op, 5)  # l1
-  _AssertInputIsScalar(op, 6)  # l2
-  _AssertInputIsScalar(op, 7)  # lr_power
-  return [grad_shape]
-
-
-@ops.RegisterShape("ApplyAdagradDA")
-def ApplyAdagradDAShape(op):
-  """Shape function for the ApplyAdagradDAOp op."""
-  var_shape = op.inputs[0].get_shape()
-  g_accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  gg_accum_shape = op.inputs[2].get_shape().merge_with(g_accum_shape)
-  grad_shape = op.inputs[3].get_shape().merge_with(gg_accum_shape)
-  _AssertInputIsScalar(op, 4)  # lr
-  _AssertInputIsScalar(op, 5)  # l1
-  _AssertInputIsScalar(op, 6)  # l2
-  _AssertInputIsScalar(op, 7)  # global_step
-  return [grad_shape]
-
-
-@ops.RegisterShape("ApplyAdam")
-def _ApplyAdamShape(op):
-  """Shape function for the ApplyAdam op."""
-  var_shape = op.inputs[0].get_shape()
-  m_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  v_shape = op.inputs[2].get_shape().merge_with(m_shape)
-  _AssertInputIsScalar(op, 3)  # beta1_power
-  _AssertInputIsScalar(op, 4)  # beta2_power
-  _AssertInputIsScalar(op, 5)  # lr
-  _AssertInputIsScalar(op, 6)  # beta1
-  _AssertInputIsScalar(op, 7)  # beta2
-  _AssertInputIsScalar(op, 8)  # epsilon
-  grad_shape = op.inputs[9].get_shape().merge_with(v_shape)
-  return [grad_shape]
-
-
-@ops.RegisterShape("ApplyMomentum")
-def _ApplyMomentumShape(op):
-  """Shape function for the ApplyMomentum op."""
-  var_shape = op.inputs[0].get_shape()
-  accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  _AssertInputIsScalar(op, 2)  # lr
-  grad_shape = op.inputs[3].get_shape().merge_with(accum_shape)
-  _AssertInputIsScalar(op, 4)  # momentum
-  return [grad_shape]
-
-
-@ops.RegisterShape("ApplyRMSProp")
-def _ApplyRMSPropShape(op):
-  """Shape function for the ApplyRMSProp op."""
-  var_shape = op.inputs[0].get_shape()
-  ms_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  mom_shape = op.inputs[2].get_shape().merge_with(ms_shape)
-  _AssertInputIsScalar(op, 3)  # lr
-  _AssertInputIsScalar(op, 4)  # rho
-  _AssertInputIsScalar(op, 5)  # momentum
-  _AssertInputIsScalar(op, 6)  # epsilon
-  grad_shape = op.inputs[7].get_shape().merge_with(mom_shape)
-  return [grad_shape]
-
-
-@ops.RegisterShape("ApplyGradientDescent")
-def _ApplyGradientDescentShape(op):
-  """Shape function for the ApplyGradientDescent op."""
-  var_shape = op.inputs[0].get_shape()
-  _AssertInputIsScalar(op, 1)  # alpha
-  delta_shape = op.inputs[2].get_shape().merge_with(var_shape)
-  return [delta_shape]
-
-
-@ops.RegisterShape("ApplyProximalGradientDescent")
-def _ApplyProximalGradientDescentShape(op):
-  """Shape function for the ApplyProximalGradientDescent op."""
-  var_shape = op.inputs[0].get_shape()
-  _AssertInputIsScalar(op, 1)  # alpha
-  _AssertInputIsScalar(op, 2)  # l1
-  _AssertInputIsScalar(op, 3)  # l2
-  delta_shape = op.inputs[4].get_shape().merge_with(var_shape)
-  return [delta_shape]
-
-
-@ops.RegisterShape("SparseApplyProximalGradientDescent")
-def _SparseApplyProximalGradientDescentShape(op):
-  """Shape function for the SparseApplyGradientDescent op."""
-  var_shape = op.inputs[0].get_shape()
-  _AssertInputIsScalar(op, 1)  # lr
-  _AssertInputIsScalar(op, 2)  # l1
-  _AssertInputIsScalar(op, 3)  # l2
-  grad_shape = op.inputs[4].get_shape().merge_with(
-      tensor_shape.TensorShape([None]).concatenate(var_shape[1:]))
-  unused_indices_shape = op.inputs[5].get_shape().merge_with(
-      tensor_shape.vector(grad_shape[0]))
-  return [var_shape]
-
-
-@ops.RegisterShape("SparseApplyRMSProp")
-def _SparseApplyRMSPropShape(op):
-  """Shape function for the SparseApplyRMSProp op."""
-  var_shape = op.inputs[0].get_shape()
-  ms_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  mom_shape = op.inputs[2].get_shape().merge_with(ms_shape)
-  _AssertInputIsScalar(op, 3)  # lr
-  _AssertInputIsScalar(op, 4)  # rho
-  _AssertInputIsScalar(op, 5)  # momentum
-  _AssertInputIsScalar(op, 6)  # epsilon
-  grad_shape = op.inputs[7].get_shape().merge_with(
-      tensor_shape.TensorShape([None]).concatenate(mom_shape[1:]))
-  unused_indices_shape = op.inputs[8].get_shape().merge_with(
-      tensor_shape.vector(grad_shape[0]))
-  return [mom_shape]
-
-
-@ops.RegisterShape("SparseApplyAdadelta")
-def _SparseApplyAdadeltaShape(op):
-  """Shape function for the SparseApplyAdadelta op."""
-  var_shape = op.inputs[0].get_shape()
-  accum_grad_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  accum_update_shape = op.inputs[2].get_shape().merge_with(accum_grad_shape)
-  _AssertInputIsScalar(op, 3)  # lr
-  _AssertInputIsScalar(op, 4)  # decay_rate
-  _AssertInputIsScalar(op, 5)  # epsilon
-  grad_shape = op.inputs[6].get_shape().merge_with(
-      tensor_shape.TensorShape([None]).concatenate(accum_update_shape[1:]))
-  unused_indices_shape = op.inputs[7].get_shape().merge_with(
-      tensor_shape.vector(grad_shape[0]))
-  return [accum_update_shape]
-
-
-@ops.RegisterShape("SparseApplyAdagrad")
-def _SparseApplyAdagradShape(op):
-  """Shape function for the SparseApplyAdagrad op."""
-  var_shape = op.inputs[0].get_shape()
-  accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  _AssertInputIsScalar(op, 2)  # lr
-  grad_shape = op.inputs[3].get_shape().merge_with(
-      tensor_shape.TensorShape([None]).concatenate(accum_shape[1:]))
-  unused_indices_shape = op.inputs[4].get_shape().merge_with(
-      tensor_shape.vector(grad_shape[0]))
-  return [accum_shape]
-
-
-@ops.RegisterShape("SparseApplyProximalAdagrad")
-def _SparseApplyProximalAdagradShape(op):
-  """Shape function for the SparseApplyProximalAdagrad op."""
-  var_shape = op.inputs[0].get_shape()
-  accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  _AssertInputIsScalar(op, 2)  # lr
-  _AssertInputIsScalar(op, 3)  # l1
-  _AssertInputIsScalar(op, 4)  # l2
-  grad_shape = op.inputs[5].get_shape().merge_with(
-      tensor_shape.TensorShape([None]).concatenate(accum_shape[1:]))
-  unused_indices_shape = op.inputs[6].get_shape().merge_with(
-      tensor_shape.vector(grad_shape[0]))
-  return [accum_shape]
-
-
-@ops.RegisterShape("SparseApplyFtrl")
-def _SparseApplyFtrlShape(op):
-  """Shape function for the SparseApplyFtrl op."""
-  var_shape = op.inputs[0].get_shape()
-  accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  linear_shape = op.inputs[2].get_shape().merge_with(accum_shape)
-  grad_shape = op.inputs[3].get_shape().merge_with(
-      tensor_shape.TensorShape([None]).concatenate(linear_shape[1:]))
-  unused_indices_shape = op.inputs[4].get_shape().merge_with(
-      tensor_shape.vector(grad_shape[0]))
-  _AssertInputIsScalar(op, 5)  # lr
-  _AssertInputIsScalar(op, 6)  # l1
-  _AssertInputIsScalar(op, 7)  # l2
-  _AssertInputIsScalar(op, 8)  # lr_power
-  return [linear_shape]
-
-
-@ops.RegisterShape("SparseApplyAdagradDA")
-def _SparseApplyAdagradDAShape(op):
-  """Shape function for the SparseApplyAdagradDA op."""
-  var_shape = op.inputs[0].get_shape()
-  g_accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  gg_accum_shape = op.inputs[2].get_shape().merge_with(g_accum_shape)
-  grad_shape = op.inputs[3].get_shape().merge_with(
-      tensor_shape.TensorShape([None]).concatenate(gg_accum_shape[1:]))
-  unused_indices_shape = op.inputs[4].get_shape().merge_with(
-      tensor_shape.vector(grad_shape[0]))
-  _AssertInputIsScalar(op, 5)  # lr
-  _AssertInputIsScalar(op, 6)  # l1
-  _AssertInputIsScalar(op, 7)  # l2
-  _AssertInputIsScalar(op, 8)  # global_step
-  return [gg_accum_shape]
-
-
-@ops.RegisterShape("SparseApplyMomentum")
-def _SparseApplyMomentumShape(op):
-  """Shape function for the SparseApplyMomentum op."""
-  var_shape = op.inputs[0].get_shape()
-  accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
-  _AssertInputIsScalar(op, 2)  # lr
-  grad_shape = op.inputs[3].get_shape().merge_with(
-      tensor_shape.TensorShape([None]).concatenate(accum_shape[1:]))
-  unused_indices_shape = op.inputs[4].get_shape().merge_with(
-      tensor_shape.vector(grad_shape[0]))
-  _AssertInputIsScalar(op, 5)  # momentum
-  return [accum_shape]
+ops.RegisterShape("ApplyAdadelta")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ApplyAdagrad")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ApplyProximalAdagrad")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ApplyFtrl")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ApplyAdagradDA")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ApplyAdam")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ApplyMomentum")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ApplyRMSProp")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ApplyGradientDescent")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("ApplyProximalGradientDescent")(
+    common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("SparseApplyProximalGradientDescent")(
+    common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("SparseApplyRMSProp")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("SparseApplyAdadelta")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("SparseApplyAdagrad")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("SparseApplyProximalAdagrad")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("SparseApplyFtrl")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("SparseApplyAdagradDA")(common_shapes.call_cpp_shape_fn)
+ops.RegisterShape("SparseApplyMomentum")(common_shapes.call_cpp_shape_fn)
diff --git a/tensorflow/tensorboard/components/vz-projector/bh_tsne.ts b/tensorflow/tensorboard/components/vz-projector/bh_tsne.ts
index 35a8328cd6d..db315ccda6c 100644
--- a/tensorflow/tensorboard/components/vz-projector/bh_tsne.ts
+++ b/tensorflow/tensorboard/components/vz-projector/bh_tsne.ts
@@ -375,7 +375,7 @@ export class TSNE {
     let annotateTree =
         (node: AugmSPNode): {numCells: number, yCell: number[]} => {
           let numCells = node.points ? node.points.length : 0;
-          if (node.children === null) {
+          if (node.children == null) {
             // Update the current node and tell the parent.
             node.numCells = numCells;
             // TODO(smilkov): yCell should be average across all points.
@@ -387,7 +387,7 @@ export class TSNE {
               node.points ? node.points[0].slice() : zerosArray(this.dim);
           for (let i = 0; i < node.children.length; ++i) {
             let child = node.children[i];
-            if (child === null) {
+            if (child == null) {
               continue;
             }
             let result = annotateTree(child as AugmSPNode);
@@ -432,7 +432,7 @@ export class TSNE {
       tree.visit((node: AugmSPNode) => {
         let squaredDistToCell = this.dist2(pointI, node.yCell);
         // Squared distance from point i to cell.
-        if (node.children === null ||
+        if (node.children == null ||
             (node.rCell / Math.sqrt(squaredDistToCell) < THETA)) {
           let qijZ = 1 / (1 + squaredDistToCell);
           let dZ = node.numCells * qijZ;
diff --git a/tensorflow/tensorboard/components/vz-projector/scatter.ts b/tensorflow/tensorboard/components/vz-projector/scatter.ts
index c6c6590272d..d389aa3f65e 100644
--- a/tensorflow/tensorboard/components/vz-projector/scatter.ts
+++ b/tensorflow/tensorboard/components/vz-projector/scatter.ts
@@ -93,8 +93,6 @@ export interface Scatter {
   highlightPoints(
       pointIndexes: number[], highlightStroke?: (index: number) => string,
       favorLabels?: (index: number) => boolean): void;
-  /** Whether to show labels or not. */
-  showLabels(show: boolean): void;
   /** Toggle between day and night modes. */
   setDayNightMode(isNight: boolean): void;
   /** Show/hide tick labels. */
diff --git a/tensorflow/tensorboard/components/vz-projector/scatterWebGL.ts b/tensorflow/tensorboard/components/vz-projector/scatterWebGL.ts
index a4608b571c9..f0533000fdf 100644
--- a/tensorflow/tensorboard/components/vz-projector/scatterWebGL.ts
+++ b/tensorflow/tensorboard/components/vz-projector/scatterWebGL.ts
@@ -245,8 +245,7 @@ export class ScatterWebGL implements Scatter {
   private height: number;
   private width: number;
   private mode: Mode;
-  /** Whether the user has turned labels on or off. */
-  private labelsAreOn = true;
+
   /** Whether the label canvas has been already cleared. */
   private labelCanvasIsCleared = true;
 
@@ -1024,20 +1023,18 @@ export class ScatterWebGL implements Scatter {
    * collision grid.
    */
   private makeLabels() {
-    // Don't make labels if they are turned off.
-    if (!this.labelsAreOn || this.points == null) {
+    if (this.points == null) {
       return;
     }
     // First, remove all old labels.
     this.removeAllLabels();
 
-    this.labelCanvasIsCleared = false;
-    // If we are passed no points to label (that is, not mousing over any
-    // points) then want to label ALL the points that we can.
     if (!this.labeledPoints.length) {
-      this.labeledPoints = this.shuffledData;
+      return;
     }
 
+    this.labelCanvasIsCleared = false;
+
     // We never render more than ~500 labels, so when we get much past that
     // point, just break.
     let numRenderedLabels: number = 0;
@@ -1435,15 +1432,6 @@ export class ScatterWebGL implements Scatter {
 
   getHighlightedPoints(): number[] { return this.highlightedPoints; }
 
-  showLabels(show: boolean) {
-    this.labelsAreOn = show;
-    if (this.labelsAreOn) {
-      this.makeLabels();
-    } else {
-      this.removeAllLabels();
-    }
-  }
-
   /**
    * Toggles between day and night mode (resets corresponding variables for
    * color, etc.)
diff --git a/tensorflow/tensorboard/components/vz-projector/vector.ts b/tensorflow/tensorboard/components/vz-projector/vector.ts
index 82fef77b7b0..5f7914ef7b9 100644
--- a/tensorflow/tensorboard/components/vz-projector/vector.ts
+++ b/tensorflow/tensorboard/components/vz-projector/vector.ts
@@ -183,7 +183,7 @@ export type Predicate<T> = (a: T) => boolean;
 export function centroid<T>(
     dataPoints: T[], predicate: Predicate<T>,
     accessor?: (a: T) => Vector): {centroid: Vector, numMatches: number} {
-  if (accessor === null) {
+  if (accessor == null) {
     accessor = (a: T) => <any>a;
   }
   assert(dataPoints.length >= 0, '`vectors` must be of length >= 1');
diff --git a/tensorflow/tensorboard/components/vz-projector/vz-projector-data-loader.ts b/tensorflow/tensorboard/components/vz-projector/vz-projector-data-loader.ts
index 6b7515c2ddf..7c22c536566 100644
--- a/tensorflow/tensorboard/components/vz-projector/vz-projector-data-loader.ts
+++ b/tensorflow/tensorboard/components/vz-projector/vz-projector-data-loader.ts
@@ -216,7 +216,7 @@ class DataLoader extends DataLoaderPolymer {
 
     // Demo dataset dropdown
     let demoDatasetChanged = (demoDataSet: DemoDataset) => {
-      if (demoDataSet === null) {
+      if (demoDataSet == null) {
         return;
       }
 
@@ -409,7 +409,7 @@ function parseTensors(content: string, delim = '\t'): Promise<DataPoint[]> {
         dataPoint.vector = row.map(Number);
       }
       data.push(dataPoint);
-      if (numDim === null) {
+      if (numDim == null) {
         numDim = dataPoint.vector.length;
       }
       if (numDim !== dataPoint.vector.length) {
diff --git a/tensorflow/tensorboard/components/vz-projector/vz-projector.html b/tensorflow/tensorboard/components/vz-projector/vz-projector.html
index 25a90bb3d81..62653591b69 100644
--- a/tensorflow/tensorboard/components/vz-projector/vz-projector.html
+++ b/tensorflow/tensorboard/components/vz-projector/vz-projector.html
@@ -551,10 +551,6 @@ paper-listbox .pca-item {
         <i class="material-icons">photo_size_select_small</i>
         Select
       </button>
-      <button class="menu-button show-labels selected" title="Show/hide labels">
-        <i class="material-icons">text_fields</i>
-        Labels
-      </button>
       <button class="menu-button nightDayMode" title="Toggle between night and day mode">
         <i class="material-icons">brightness_2</i>
         Night Mode
diff --git a/tensorflow/tensorboard/components/vz-projector/vz-projector.ts b/tensorflow/tensorboard/components/vz-projector/vz-projector.ts
index cf4492d080c..f4c4d59d7bd 100644
--- a/tensorflow/tensorboard/components/vz-projector/vz-projector.ts
+++ b/tensorflow/tensorboard/components/vz-projector/vz-projector.ts
@@ -404,14 +404,6 @@ class Projector extends ProjectorPolymer {
       this.updateMenuButtons();
     });
 
-    let showLabels = true;
-    let showLabelsButton = this.dom.select('.show-labels');
-    showLabelsButton.on('click', () => {
-      showLabels = !showLabels;
-      this.scatter.showLabels(showLabels);
-      showLabelsButton.classed('selected', showLabels);
-    });
-
     let dayNightModeButton = this.dom.select('.nightDayMode');
     let modeIsNight = dayNightModeButton.classed('selected');
     dayNightModeButton.on('click', () => {
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 3d367d8ac64..565ecdce987 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -813,3 +813,18 @@ def tf_genrule_cmd_append_to_srcs(to_append):
     return ("cat $(SRCS) > $(@) && " +
             "echo >> $(@) && " +
             "echo " + to_append + " >> $(@)")
+
+
+def tf_version_info_genrule():
+  native.genrule(
+      name = "version_info_gen",
+      srcs = [  # passed via $(SRCS); generate() unpacks spec, head, ref by position
+          "//tensorflow/tools/git:gen/spec.json",
+          "//tensorflow/tools/git:gen/head",
+          "//tensorflow/tools/git:gen/branch_ref",
+      ],
+      outs = ["util/version_info.cc"],
+      cmd = "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"",
+      local = 1,  # NOTE(review): presumably because the script runs git on the real tree - confirm
+      tools = ["//tensorflow/tools/git:gen_git_source.py"],
+  )
diff --git a/tensorflow/tf_exported_symbols.lds b/tensorflow/tf_exported_symbols.lds
index e7749ab0f70..cb81e89922c 100644
--- a/tensorflow/tf_exported_symbols.lds
+++ b/tensorflow/tf_exported_symbols.lds
@@ -1,2 +1,3 @@
 *tensorflow*
 *perftools*gputools*
+*tf_*
diff --git a/tensorflow/tools/dist_test/Dockerfile b/tensorflow/tools/dist_test/Dockerfile
index 4440f4cbbec..b3e7489e071 100644
--- a/tensorflow/tools/dist_test/Dockerfile
+++ b/tensorflow/tools/dist_test/Dockerfile
@@ -20,7 +20,7 @@ RUN /var/gcloud/google-cloud-sdk/bin/gcloud components install kubectl
 # Install nightly TensorFlow pip
 # TODO(cais): Should we build it locally instead?
 RUN pip install \
-    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl
+    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl
 
 # Copy test files
 COPY scripts /var/tf-dist-test/scripts
diff --git a/tensorflow/tools/dist_test/server/Dockerfile b/tensorflow/tools/dist_test/server/Dockerfile
index f06de68f85d..6876621299c 100644
--- a/tensorflow/tools/dist_test/server/Dockerfile
+++ b/tensorflow/tools/dist_test/server/Dockerfile
@@ -36,7 +36,7 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
 
 # Install TensorFlow CPU version from nightly build
 RUN pip --no-cache-dir install \
-    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl
+    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl
 
 # Copy files, including the GRPC server binary at
 # server/grpc_tensorflow_server.py
diff --git a/tensorflow/tools/dist_test/server/Dockerfile.test b/tensorflow/tools/dist_test/server/Dockerfile.test
index ffb0ba9d24b..1f7d2bdc051 100644
--- a/tensorflow/tools/dist_test/server/Dockerfile.test
+++ b/tensorflow/tools/dist_test/server/Dockerfile.test
@@ -42,7 +42,7 @@ RUN pip install --upgrade pandas==0.18.1
 
 # Install TensorFlow CPU version.
 RUN pip --no-cache-dir install \
-    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl
+    https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl
 
 # Copy files, including the GRPC server binary at
 # server/grpc_tensorflow_server.py
diff --git a/tensorflow/tools/gcs_test/Dockerfile b/tensorflow/tools/gcs_test/Dockerfile
index d377636fe4f..a9d2e8a3ae5 100644
--- a/tensorflow/tools/gcs_test/Dockerfile
+++ b/tensorflow/tools/gcs_test/Dockerfile
@@ -16,7 +16,7 @@ RUN ./install_google_cloud_sdk.bash --disable-prompts --install-dir=/var/gcloud
 
 # Install nightly TensorFlow pip
 RUN pip install \
-   https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl
+   https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl
 
 # Copy test files
 RUN mkdir -p /gcs-smoke/python
diff --git a/tensorflow/tools/git/BUILD b/tensorflow/tools/git/BUILD
new file mode 100644
index 00000000000..f502c8dde07
--- /dev/null
+++ b/tensorflow/tools/git/BUILD
@@ -0,0 +1,28 @@
+# Description:
+# Contains script to generate tensorflow/core/util/version_info.cc
+# Also contains information about git repository deposited by configure
+# in gen/...
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(
+    glob(["gen/*"]) + [
+        "gen_git_source.py",
+    ],
+)
+
+# -----------------------------------------------------------------------------
+# Google-internal targets.  These must be at the end for syncrepo.
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
new file mode 100755
index 00000000000..6c0770b1ffa
--- /dev/null
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -0,0 +1,223 @@
+#!/usr/bin/python
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Help include git hash in tensorflow bazel build.
+
+This creates symlinks from the internal git repository directory so
+that the build system can see changes in the version state. We also
+remember what branch git was on so when the branch changes we can
+detect that the ref file is no longer correct (so we can suggest users
+run ./configure again).
+
+NOTE: this script is only used in opensource.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import argparse
+import json
+import os
+import shutil
+
+
+def parse_branch_ref(filename):
+  """Given a filename of a .git/HEAD file return ref path.
+
+  In particular, if git is in detached head state, this will
+  return None. If git is in attached head, it will return
+  the branch reference. E.g. if on 'master', the HEAD will
+  contain 'ref: refs/heads/master' so 'refs/heads/master'
+  will be returned.
+
+  Example: parse_branch_ref(".git/HEAD")
+  Args:
+    filename: file to treat as a git HEAD file
+  Returns:
+    None if detached head, otherwise ref subpath
+  Raises:
+    RuntimeError: if the HEAD file is unparseable.
+  """
+  with open(filename) as head_file:  # close the handle deterministically
+    data = head_file.read().strip()
+  items = data.split(" ")
+  if len(items) == 1:  # a single token, i.e. no "ref:" prefix: detached head
+    return None
+  elif len(items) == 2 and items[0] == "ref:":
+    return items[1].strip()
+  else:
+    raise RuntimeError("Git directory has unparseable HEAD")
+
+
+def configure(src_base_path, debug=False):
+  """Configure `src_base_path` to embed git hashes if available.
+
+  Recreates tensorflow/tools/git/gen with `head`, `branch_ref` and `spec.json`;
+  raises RuntimeError if that path is a non-directory or cannot be created.
+  """
+  # TODO(aselle): No files generated or symlinked here are deleted by
+  # the build system. I don't know of a way to do it in bazel. It
+  # should only be a problem if somebody moves a sandbox directory
+  # without running ./configure again.
+  git_path = os.path.join(src_base_path, ".git")
+  gen_path = os.path.join(src_base_path, "tensorflow", "tools", "git", "gen")
+
+  # Remove and recreate the path
+  if os.path.exists(gen_path):
+    if os.path.isdir(gen_path):
+      shutil.rmtree(gen_path)
+    else:
+      raise RuntimeError("Cannot delete non-directory %s, inspect "
+                         "and remove manually" % gen_path)
+  os.makedirs(gen_path)
+  if not os.path.isdir(gen_path):
+    raise RuntimeError("gen_git_source.py: Failed to create dir")
+
+  # file that specifies what the state of the git repo is
+  spec = {}
+  # value file names will be mapped to the keys
+  link_map = {"head": None, "branch_ref": None}
+
+  if not os.path.isdir(git_path):
+    # No git directory; the loop below writes the empty placeholder files
+    spec["git"] = False
+  else:
+    # Git directory, possibly detached or attached
+    spec["git"] = True
+    spec["path"] = src_base_path
+    git_head_path = os.path.join(git_path, "HEAD")
+    spec["branch"] = parse_branch_ref(git_head_path)
+    link_map["head"] = git_head_path
+    if spec["branch"] is not None:
+      # attached head: also link the ref file that HEAD points at
+      link_map["branch_ref"] = os.path.join(git_path,
+                                            *spec["branch"].split("/"))
+  # Create symlinks or dummy files
+  for target, src in link_map.items():
+    if src is None:
+      with open(os.path.join(gen_path, target), "w"):
+        pass  # opening in "w" mode is enough to create the empty file
+    else:
+      os.symlink(src, os.path.join(gen_path, target))
+
+  with open(os.path.join(gen_path, "spec.json"), "w") as spec_file:
+    json.dump(spec, spec_file, indent=2)
+  if debug:
+    print("gen_git_source.py: list %s" % gen_path)
+    print("gen_git_source.py: %s" % repr(os.listdir(gen_path)))
+    print("gen_git_source.py: spec is %r" % spec)
+
+
+def generate(arglist):
+  """Generate version_info.cc as given `destination_file`.
+
+  Args:
+    arglist: sequence of spec, head_symlink, ref_symlink, destination_file.
+
+  `destination_file` is the filename where version_info.cc will be written
+
+  `spec` is a filename where the file contains a JSON dictionary
+    'git' bool that is true if the source is in a git repo
+    'path' base path of the source code
+    'branch' the name of the ref specification of the current branch/tag
+
+  `head_symlink` is a filename to HEAD that is cross-referenced against
+    what is contained in the json branch designation.
+
+  `ref_symlink` is unused in this script but passed, because the build
+    system uses that file to detect when commits happen.
+
+  Raises:
+    RuntimeError: If ./configure needs to be run, RuntimeError will be raised.
+  """
+  spec, head_symlink, _, dest_file = arglist  # ref_symlink is unused
+  with open(spec) as spec_file:
+    data = json.load(spec_file)
+  strs = {"tf_compiler_version": "__VERSION__"}
+  if not data["git"]:
+    strs["tf_git_version"] = "internal"
+  else:
+    old_branch = data["branch"]
+    new_branch = parse_branch_ref(head_symlink)
+    if new_branch != old_branch:
+      raise RuntimeError(
+          "Run ./configure again, branch was '%s' but is now '%s'" %
+          (old_branch, new_branch))
+    version = os.popen(
+        "git -C \"%s\" describe --long --dirty --tags" %
+        (data["path"],)).read().strip()
+    strs["tf_git_version"] = version if version else "unknown"  # git gave none
+  # TODO(aselle): Check for escaping
+  cpp_file = "\n".join("const char* %s() {return \"%s\";}" % (x, y)
+                       for x, y in strs.items())
+  with open(dest_file, "w") as out_file:
+    out_file.write(cpp_file + "\n")
+
+
+def raw_generate(output_file):
+  """Simple generator used for cmake/make build systems.
+
+  This does not create any symlinks. It requires the build system
+  to build unconditionally.
+
+  Args:
+    output_file: Output filename for the version info cc
+  """
+  strs = {"tf_compiler_version": "__VERSION__"}
+  version = os.popen("git describe --long --dirty --tags").read().strip()
+  version = version if version else "unknown"  # nothing was read from git
+  strs["tf_git_version"] = version
+  cpp_file = "\n".join("const char* %s() {return \"%s\";}" % (x, y)
+                       for x, y in strs.items())
+  with open(output_file, "w") as out_file:
+    out_file.write(cpp_file + "\n")
+
+
+parser = argparse.ArgumentParser(description="""Git hash injection into bazel.
+If used with --configure <path> will search for git directory and put symlinks
+into source so that a bazel genrule can call --generate""")
+
+parser.add_argument(
+    "--debug",
+    type=lambda s: s.lower() in ("1", "true", "yes"),  # bool("False") is True
+    help="print debugging information about paths",
+    default=False)
+
+parser.add_argument(
+    "--configure", type=str,
+    help="Path to configure as a git repo dependency tracking sentinel")
+
+parser.add_argument(
+    "--generate",
+    type=str,
+    help="Generate given spec-file, HEAD-symlink-file, ref-symlink-file",
+    nargs="+")
+
+parser.add_argument(
+    "--raw_generate",
+    type=str,
+    help="Generate version_info.cc (simpler version used for cmake/make)")
+
+args = parser.parse_args()
+
+if args.configure is not None:  # exactly one mode runs; first match wins
+  configure(args.configure, debug=args.debug)
+elif args.generate is not None:
+  generate(args.generate)
+elif args.raw_generate is not None:
+  raw_generate(args.raw_generate)
+else:
+  raise RuntimeError("--configure or --generate or --raw_generate "
+                     "must be used")