diff --git a/eigen.BUILD b/eigen.BUILD
index a657493380b..16dd4f84228 100644
--- a/eigen.BUILD
+++ b/eigen.BUILD
@@ -1,6 +1,6 @@
 package(default_visibility = ["//visibility:public"])
 
-archive_dir = "eigen-eigen-a5e9085a94e8"
+archive_dir = "eigen-eigen-f3a13643ac1f"
 
 cc_library(
     name = "eigen",
diff --git a/tensorflow/contrib/cmake/external/eigen.cmake b/tensorflow/contrib/cmake/external/eigen.cmake
index 42fa7686632..c1929a10f32 100644
--- a/tensorflow/contrib/cmake/external/eigen.cmake
+++ b/tensorflow/contrib/cmake/external/eigen.cmake
@@ -7,7 +7,7 @@
 
 include (ExternalProject)
 
-set(eigen_archive_hash "a5e9085a94e8")
+set(eigen_archive_hash "f3a13643ac1f")
 
 set(eigen_INCLUDE_DIRS
     ${CMAKE_CURRENT_BINARY_DIR}
@@ -16,7 +16,7 @@ set(eigen_INCLUDE_DIRS
     ${tensorflow_source_dir}/third_party/eigen3
 )
 set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
-set(eigen_HASH SHA256=967126237829c7c87abb6cd0e13a5a235b0377d51575522c390b9486aed13e71)
+set(eigen_HASH SHA256=a9266e60366cddb371a23d86b11a297eee86372a89ef4b38a3509012f9cc37ec)
 set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
 set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 451a34320e0..44263dc8aed 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -55,9 +55,9 @@ cuda_py_tests(
 )
 
 cuda_py_tests(
-    name = "gaussian_test",
+    name = "normal_test",
     size = "small",
-    srcs = ["python/kernel_tests/gaussian_test.py"],
+    srcs = ["python/kernel_tests/normal_test.py"],
     additional_deps = [
         ":distributions_py",
         "//tensorflow/python:framework_test_lib",
@@ -98,9 +98,9 @@ cuda_py_tests(
 )
 
 cuda_py_tests(
-    name = "gaussian_conjugate_posteriors_test",
+    name = "normal_conjugate_posteriors_test",
     size = "small",
-    srcs = ["python/kernel_tests/gaussian_conjugate_posteriors_test.py"],
+    srcs = ["python/kernel_tests/normal_conjugate_posteriors_test.py"],
     additional_deps = [
         ":distributions_py",
         "//tensorflow/python:platform_test",
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 2c8a0343b28..7fa8c0fb0c6 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -30,7 +30,7 @@ initialized with parameters that define the distributions.
 @@Chi2
 @@Exponential
 @@Gamma
-@@Gaussian
+@@Normal
 @@StudentT
 @@Uniform
 
@@ -44,10 +44,10 @@ initialized with parameters that define the distributions.
 Functions that transform conjugate prior/likelihood pairs to distributions
 representing the posterior or posterior predictive.
 
-### Gaussian likelihood with conjugate prior.
+### Normal likelihood with conjugate prior.
 
-@@gaussian_conjugates_known_sigma_posterior
-@@gaussian_congugates_known_sigma_predictive
+@@normal_conjugates_known_sigma_posterior
+@@normal_congugates_known_sigma_predictive
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -60,8 +60,8 @@ from tensorflow.contrib.distributions.python.ops.dirichlet_multinomial import *
 from tensorflow.contrib.distributions.python.ops.distribution import *
 from tensorflow.contrib.distributions.python.ops.exponential import *
 from tensorflow.contrib.distributions.python.ops.gamma import *
-from tensorflow.contrib.distributions.python.ops.gaussian import *
-from tensorflow.contrib.distributions.python.ops.gaussian_conjugate_posteriors import *
 from tensorflow.contrib.distributions.python.ops.mvn import *
+from tensorflow.contrib.distributions.python.ops.normal import *
+from tensorflow.contrib.distributions.python.ops.normal_conjugate_posteriors import *
 from tensorflow.contrib.distributions.python.ops.student_t import *
 from tensorflow.contrib.distributions.python.ops.uniform import *
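The rename is purely cosmetic: `Normal` is the same class `Gaussian` was, and note that the pre-existing `congugates` misspelling in the predictive helper's identifier is carried over unchanged. A minimal before/after sketch for callers, assuming a `tf.contrib` build from this era:

```python
import tensorflow as tf

# Before this change:
#   dist = tf.contrib.distributions.Gaussian(mu=0., sigma=3.)
# After this change, the same distribution is constructed as:
dist = tf.contrib.distributions.Normal(mu=0., sigma=3.)

with tf.Session():
  # Evaluate the cdf at 1, returning a scalar, exactly as before.
  print(dist.cdf(1.).eval())
```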
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/exponential_test.py b/tensorflow/contrib/distributions/python/kernel_tests/exponential_test.py
index 5e3fed1ed80..6fd03e90bf6 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/exponential_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/exponential_test.py
@@ -105,10 +105,9 @@ class ExponentialTest(tf.test.TestCase):
 
       exponential = tf.contrib.distributions.Exponential(lam=lam)
 
-      n_v = 100000
-      n = tf.constant(n_v)
+      n = 100000
       samples = exponential.sample(n, seed=138)
-      self.assertEqual(samples.get_shape(), (n_v, batch_size, 2))
+      self.assertEqual(samples.get_shape(), (n, batch_size, 2))
 
       sample_values = samples.eval()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/gaussian_conjugate_posteriors_test.py b/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py
similarity index 74%
rename from tensorflow/contrib/distributions/python/kernel_tests/gaussian_conjugate_posteriors_test.py
rename to tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py
index c3a2464b5bd..1d03396bf68 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/gaussian_conjugate_posteriors_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py
@@ -25,9 +25,9 @@ import tensorflow as tf
 distributions = tf.contrib.distributions
 
 
-class GaussianTest(tf.test.TestCase):
+class NormalTest(tf.test.TestCase):
 
-  def testGaussianConjugateKnownSigmaPosterior(self):
+  def testNormalConjugateKnownSigmaPosterior(self):
     with tf.Session():
       mu0 = tf.constant([3.0])
       sigma0 = tf.constant([math.sqrt(10.0)])
@@ -35,16 +35,16 @@ class GaussianTest(tf.test.TestCase):
       x = tf.constant([-2.5, 2.5, 4.0, 0.0, -1.0, 2.0])
       s = tf.reduce_sum(x)
       n = tf.size(x)
-      prior = distributions.Gaussian(mu=mu0, sigma=sigma0)
-      posterior = distributions.gaussian_conjugates_known_sigma_posterior(
+      prior = distributions.Normal(mu=mu0, sigma=sigma0)
+      posterior = distributions.normal_conjugates_known_sigma_posterior(
           prior=prior, sigma=sigma, s=s, n=n)
 
       # Smoke test
-      self.assertTrue(isinstance(posterior, distributions.Gaussian))
+      self.assertTrue(isinstance(posterior, distributions.Normal))
       posterior_log_pdf = posterior.log_pdf(x).eval()
       self.assertEqual(posterior_log_pdf.shape, (6,))
 
-  def testGaussianConjugateKnownSigmaPosteriorND(self):
+  def testNormalConjugateKnownSigmaPosteriorND(self):
     with tf.Session():
       batch_size = 6
       mu0 = tf.constant([[3.0, -3.0]] * batch_size)
@@ -54,16 +54,16 @@ class GaussianTest(tf.test.TestCase):
           tf.constant([[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0]], dtype=tf.float32))
       s = tf.reduce_sum(x)
       n = tf.size(x)
-      prior = distributions.Gaussian(mu=mu0, sigma=sigma0)
-      posterior = distributions.gaussian_conjugates_known_sigma_posterior(
+      prior = distributions.Normal(mu=mu0, sigma=sigma0)
+      posterior = distributions.normal_conjugates_known_sigma_posterior(
          prior=prior, sigma=sigma, s=s, n=n)
 
       # Smoke test
-      self.assertTrue(isinstance(posterior, distributions.Gaussian))
+      self.assertTrue(isinstance(posterior, distributions.Normal))
       posterior_log_pdf = posterior.log_pdf(x).eval()
       self.assertEqual(posterior_log_pdf.shape, (6, 2))
 
-  def testGaussianConjugateKnownSigmaNDPosteriorND(self):
+  def testNormalConjugateKnownSigmaNDPosteriorND(self):
     with tf.Session():
       batch_size = 6
       mu0 = tf.constant([[3.0, -3.0]] * batch_size)
@@ -75,19 +75,19 @@ class GaussianTest(tf.test.TestCase):
       s = tf.reduce_sum(x, reduction_indices=[1])
       x = tf.transpose(x)  # Reshape to shape (6, 2)
       n = tf.constant([6] * 2)
-      prior = distributions.Gaussian(mu=mu0, sigma=sigma0)
-      posterior = distributions.gaussian_conjugates_known_sigma_posterior(
+      prior = distributions.Normal(mu=mu0, sigma=sigma0)
+      posterior = distributions.normal_conjugates_known_sigma_posterior(
           prior=prior, sigma=sigma, s=s, n=n)
 
       # Smoke test
-      self.assertTrue(isinstance(posterior, distributions.Gaussian))
+      self.assertTrue(isinstance(posterior, distributions.Normal))
 
       # Calculate log_pdf under the 2 models
       posterior_log_pdf = posterior.log_pdf(x)
       self.assertEqual(posterior_log_pdf.get_shape(), (6, 2))
       self.assertEqual(posterior_log_pdf.eval().shape, (6, 2))
 
-  def testGaussianConjugateKnownSigmaPredictive(self):
+  def testNormalConjugateKnownSigmaPredictive(self):
     with tf.Session():
       batch_size = 6
       mu0 = tf.constant([3.0] * batch_size)
@@ -96,12 +96,12 @@ class GaussianTest(tf.test.TestCase):
       x = tf.constant([-2.5, 2.5, 4.0, 0.0, -1.0, 2.0])
       s = tf.reduce_sum(x)
       n = tf.size(x)
-      prior = distributions.Gaussian(mu=mu0, sigma=sigma0)
-      predictive = distributions.gaussian_congugates_known_sigma_predictive(
+      prior = distributions.Normal(mu=mu0, sigma=sigma0)
+      predictive = distributions.normal_congugates_known_sigma_predictive(
          prior=prior, sigma=sigma, s=s, n=n)
 
       # Smoke test
-      self.assertTrue(isinstance(predictive, distributions.Gaussian))
+      self.assertTrue(isinstance(predictive, distributions.Normal))
       predictive_log_pdf = predictive.log_pdf(x).eval()
       self.assertEqual(predictive_log_pdf.shape, (6,))
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/gaussian_test.py b/tensorflow/contrib/distributions/python/kernel_tests/normal_test.py
similarity index 79%
rename from tensorflow/contrib/distributions/python/kernel_tests/gaussian_test.py
rename to tensorflow/contrib/distributions/python/kernel_tests/normal_test.py
index f0a82df901c..0e9f8a40cca 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/gaussian_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/normal_test.py
@@ -24,9 +24,9 @@ import numpy as np
 import tensorflow as tf
 
 
-class GaussianTest(tf.test.TestCase):
+class NormalTest(tf.test.TestCase):
 
-  def testGaussianLogPDF(self):
+  def testNormalLogPDF(self):
     with tf.Session():
       batch_size = 6
       mu = tf.constant([3.0] * batch_size)
@@ -34,18 +34,18 @@ class GaussianTest(tf.test.TestCase):
       mu_v = 3.0
       sigma_v = np.sqrt(10.0)
       x = np.array([-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], dtype=np.float32)
-      gaussian = tf.contrib.distributions.Gaussian(mu=mu, sigma=sigma)
+      normal = tf.contrib.distributions.Normal(mu=mu, sigma=sigma)
 
       expected_log_pdf = np.log(
           1 / np.sqrt(2 * np.pi) / sigma_v
           * np.exp(-1.0 / (2 * sigma_v**2) * (x - mu_v)**2))
 
-      log_pdf = gaussian.log_pdf(x)
+      log_pdf = normal.log_pdf(x)
       self.assertAllClose(expected_log_pdf, log_pdf.eval())
 
-      pdf = gaussian.pdf(x)
+      pdf = normal.pdf(x)
       self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
 
-  def testGaussianLogPDFMultidimensional(self):
+  def testNormalLogPDFMultidimensional(self):
     with tf.Session():
       batch_size = 6
       mu = tf.constant([[3.0, -3.0]] * batch_size)
@@ -53,22 +53,22 @@ class GaussianTest(tf.test.TestCase):
       mu_v = np.array([3.0, -3.0])
       sigma_v = np.array([np.sqrt(10.0), np.sqrt(15.0)])
       x = np.array([[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0]], dtype=np.float32).T
-      gaussian = tf.contrib.distributions.Gaussian(mu=mu, sigma=sigma)
+      normal = tf.contrib.distributions.Normal(mu=mu, sigma=sigma)
 
       expected_log_pdf = np.log(
           1 / np.sqrt(2 * np.pi) / sigma_v
           * np.exp(-1.0 / (2 * sigma_v**2) * (x - mu_v)**2))
 
-      log_pdf = gaussian.log_pdf(x)
+      log_pdf = normal.log_pdf(x)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.get_shape(), (6, 2))
       self.assertAllClose(expected_log_pdf, log_pdf_values)
 
-      pdf = gaussian.pdf(x)
+      pdf = normal.pdf(x)
       pdf_values = pdf.eval()
       self.assertEqual(pdf.get_shape(), (6, 2))
       self.assertAllClose(np.exp(expected_log_pdf), pdf_values)
 
-  def testGaussianCDF(self):
+  def testNormalCDF(self):
     with tf.Session():
       batch_size = 6
       mu = tf.constant([3.0] * batch_size)
@@ -77,40 +77,40 @@ class GaussianTest(tf.test.TestCase):
       sigma_v = np.sqrt(10.0)
       x = np.array([-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], dtype=np.float32)
 
-      gaussian = tf.contrib.distributions.Gaussian(mu=mu, sigma=sigma)
+      normal = tf.contrib.distributions.Normal(mu=mu, sigma=sigma)
       erf_fn = np.vectorize(math.erf)
 
       # From Wikipedia
       expected_cdf = 0.5 * (1.0 + erf_fn((x - mu_v)/(sigma_v*np.sqrt(2))))
 
-      cdf = gaussian.cdf(x)
+      cdf = normal.cdf(x)
       self.assertAllClose(expected_cdf, cdf.eval())
 
-  def testGaussianEntropy(self):
+  def testNormalEntropy(self):
     with tf.Session():
       mu_v = np.array([1.0, 1.0, 1.0])
       sigma_v = np.array([[1.0, 2.0, 3.0]]).T
-      gaussian = tf.contrib.distributions.Gaussian(mu=mu_v, sigma=sigma_v)
+      normal = tf.contrib.distributions.Normal(mu=mu_v, sigma=sigma_v)
 
       sigma_broadcast = mu_v * sigma_v
       expected_entropy = 0.5 * np.log(2*np.pi*np.exp(1)*sigma_broadcast**2)
-      self.assertAllClose(expected_entropy, gaussian.entropy().eval())
+      self.assertAllClose(expected_entropy, normal.entropy().eval())
 
-  def testGaussianSample(self):
+  def testNormalSample(self):
     with tf.Session():
       mu = tf.constant(3.0)
       sigma = tf.constant(math.sqrt(10.0))
       mu_v = 3.0
       sigma_v = np.sqrt(10.0)
       n = tf.constant(100000)
-      gaussian = tf.contrib.distributions.Gaussian(mu=mu, sigma=sigma)
-      samples = gaussian.sample(n, seed=137)
+      normal = tf.contrib.distributions.Normal(mu=mu, sigma=sigma)
+      samples = normal.sample(n, seed=137)
       sample_values = samples.eval()
       self.assertEqual(sample_values.shape, (100000,))
       self.assertAllClose(sample_values.mean(), mu_v, atol=1e-2)
       self.assertAllClose(sample_values.std(), sigma_v, atol=1e-1)
 
-  def testGaussianSampleMultiDimensional(self):
+  def testNormalSampleMultiDimensional(self):
     with tf.Session():
       batch_size = 2
       mu = tf.constant([[3.0, -3.0]] * batch_size)
@@ -118,8 +118,8 @@ class GaussianTest(tf.test.TestCase):
       mu_v = [3.0, -3.0]
       sigma_v = [np.sqrt(10.0), np.sqrt(15.0)]
       n = tf.constant(100000)
-      gaussian = tf.contrib.distributions.Gaussian(mu=mu, sigma=sigma)
-      samples = gaussian.sample(n, seed=137)
+      normal = tf.contrib.distributions.Normal(mu=mu, sigma=sigma)
+      samples = normal.sample(n, seed=137)
       sample_values = samples.eval()
       self.assertEqual(samples.get_shape(), (100000, batch_size, 2))
       self.assertAllClose(sample_values[:, 0, 0].mean(), mu_v[0], atol=1e-2)
@@ -129,13 +129,13 @@ class GaussianTest(tf.test.TestCase):
 
   def testNegativeSigmaFails(self):
     with tf.Session():
-      gaussian = tf.contrib.distributions.Gaussian(
+      normal = tf.contrib.distributions.Normal(
           mu=[1.], sigma=[-5.], name='G')
       with self.assertRaisesOpError(
           r'should contain only positive values'):
-        gaussian.mean.eval()
+        normal.mean.eval()
 
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/exponential.py b/tensorflow/contrib/distributions/python/ops/exponential.py
index b80632fc496..4a93c210b91 100644
--- a/tensorflow/contrib/distributions/python/ops/exponential.py
+++ b/tensorflow/contrib/distributions/python/ops/exponential.py
@@ -70,6 +70,7 @@ class Exponential(gamma.Gamma):
     """
     broadcast_shape = self._lam.get_shape()
     with ops.op_scope([self.lam, n], name, "ExponentialSample"):
+      n = ops.convert_to_tensor(n, name="n")
      shape = array_ops.concat(
           0, [array_ops.pack([n]), array_ops.shape(self._lam)])
       sampled = random_ops.random_uniform(
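The one-line `convert_to_tensor` fix above is what lets the exponential test pass a plain Python int for `n` while keeping the static sample shape. A small sketch of the calling pattern this enables, with hypothetical rate values:

```python
import tensorflow as tf

exponential = tf.contrib.distributions.Exponential(lam=[1.0, 2.0])

# `n` may now be a plain Python int; sample() converts it to a tensor
# internally, so the static shape is still inferred as (100000, 2).
samples = exponential.sample(100000, seed=138)
print(samples.get_shape())
```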
diff --git a/tensorflow/contrib/distributions/python/ops/gaussian.py b/tensorflow/contrib/distributions/python/ops/normal.py
similarity index 82%
rename from tensorflow/contrib/distributions/python/ops/gaussian.py
rename to tensorflow/contrib/distributions/python/ops/normal.py
index 8e2049444af..dc08a0e1dec 100644
--- a/tensorflow/contrib/distributions/python/ops/gaussian.py
+++ b/tensorflow/contrib/distributions/python/ops/normal.py
@@ -38,8 +38,8 @@ def _assert_all_positive(x):
       ["Tensor %s should contain only positive values: " % x.name, x])
 
 
-class Gaussian(object):
-  """The scalar Gaussian distribution with mean and stddev parameters mu, sigma.
+class Normal(object):
+  """The scalar Normal distribution with mean and stddev parameters mu, sigma.
 
   #### Mathematical details
 
@@ -52,15 +52,15 @@ class Gaussian(object):
   Examples of initialization of one or a batch of distributions.
 
   ```python
-  # Define a single scalar Gaussian distribution.
-  dist = tf.contrib.distributions.Gaussian(mu=0, sigma=3)
+  # Define a single scalar Normal distribution.
+  dist = tf.contrib.distributions.Normal(mu=0, sigma=3)
 
   # Evaluate the cdf at 1, returning a scalar.
   dist.cdf(1)
 
-  # Define a batch of two scalar valued Gaussians.
+  # Define a batch of two scalar valued Normals.
   # The first has mean 1 and standard deviation 11, the second 2 and 22.
-  dist = tf.contrib.distributions.Gaussian(mu=[1, 2.], sigma=[11, 22.])
+  dist = tf.contrib.distributions.Normal(mu=[1, 2.], sigma=[11, 22.])
 
   # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
   # returning a length two tensor.
@@ -73,9 +73,9 @@ class Gaussian(object):
   Arguments are broadcast when possible.
 
   ```python
-  # Define a batch of two scalar valued Gaussians.
+  # Define a batch of two scalar valued Normals.
   # Both have mean 1, but different standard deviations.
-  dist = tf.contrib.distributions.Gaussian(mu=1, sigma=[11, 22.])
+  dist = tf.contrib.distributions.Normal(mu=1, sigma=[11, 22.])
 
   # Evaluate the pdf of both distributions on the same point, 3.0,
   # returning a length 2 tensor.
@@ -85,7 +85,7 @@ class Gaussian(object):
   """
 
   def __init__(self, mu, sigma, name=None):
-    """Construct Gaussian distributions with mean and stddev `mu` and `sigma`.
+    """Construct Normal distributions with mean and stddev `mu` and `sigma`.
 
     The parameters `mu` and `sigma` must be shaped in a way that supports
     broadcasting (e.g. `mu + sigma` is a valid operation).
@@ -99,7 +99,7 @@ class Gaussian(object):
     Raises:
       TypeError: if mu and sigma are different dtypes.
     """
-    with ops.op_scope([mu, sigma], name, "Gaussian"):
+    with ops.op_scope([mu, sigma], name, "Normal"):
       mu = ops.convert_to_tensor(mu)
       sigma = ops.convert_to_tensor(sigma)
       with ops.control_dependencies([_assert_all_positive(sigma)]):
@@ -125,7 +125,7 @@ class Gaussian(object):
     return self._mu * array_ops.ones_like(self._sigma)
 
   def log_pdf(self, x, name=None):
-    """Log pdf of observations in `x` under these Gaussian distribution(s).
+    """Log pdf of observations in `x` under these Normal distribution(s).
 
     Args:
       x: tensor of dtype `dtype`, must be broadcastable with `mu` and `sigma`.
@@ -134,7 +134,7 @@ class Gaussian(object):
     Returns:
       log_pdf: tensor of dtype `dtype`, the log-PDFs of `x`.
     """
-    with ops.op_scope([self._mu, self._sigma, x], name, "GaussianLogPdf"):
+    with ops.op_scope([self._mu, self._sigma, x], name, "NormalLogPdf"):
       x = ops.convert_to_tensor(x)
       if x.dtype != self.dtype:
         raise TypeError("Input x dtype does not match dtype: %s vs. %s"
@@ -144,7 +144,7 @@ class Gaussian(object):
           -0.5*math_ops.square((x - self._mu) / self._sigma))
 
   def cdf(self, x, name=None):
-    """CDF of observations in `x` under these Gaussian distribution(s).
+    """CDF of observations in `x` under these Normal distribution(s).
 
     Args:
       x: tensor of dtype `dtype`, must be broadcastable with `mu` and `sigma`.
@@ -153,7 +153,7 @@ class Gaussian(object):
     Returns:
       cdf: tensor of dtype `dtype`, the CDFs of `x`.
     """
-    with ops.op_scope([self._mu, self._sigma, x], name, "GaussianCdf"):
+    with ops.op_scope([self._mu, self._sigma, x], name, "NormalCdf"):
       x = ops.convert_to_tensor(x)
       if x.dtype != self.dtype:
         raise TypeError("Input x dtype does not match dtype: %s vs. %s"
@@ -162,7 +162,7 @@ class Gaussian(object):
           1.0/(math.sqrt(2.0) * self._sigma)*(x - self._mu)))
 
   def log_cdf(self, x, name=None):
-    """Log CDF of observations `x` under these Gaussian distribution(s).
+    """Log CDF of observations `x` under these Normal distribution(s).
 
     Args:
       x: tensor of dtype `dtype`, must be broadcastable with `mu` and `sigma`.
@@ -171,11 +171,11 @@ class Gaussian(object):
     Returns:
       log_cdf: tensor of dtype `dtype`, the log-CDFs of `x`.
     """
-    with ops.op_scope([self._mu, self._sigma, x], name, "GaussianLogCdf"):
+    with ops.op_scope([self._mu, self._sigma, x], name, "NormalLogCdf"):
       return math_ops.log(self.cdf(x))
 
   def pdf(self, x, name=None):
-    """The PDF of observations in `x` under these Gaussian distribution(s).
+    """The PDF of observations in `x` under these Normal distribution(s).
 
     Args:
       x: tensor of dtype `dtype`, must be broadcastable with `mu` and `sigma`.
@@ -184,11 +184,11 @@ class Gaussian(object):
     Returns:
       pdf: tensor of dtype `dtype`, the pdf values of `x`.
     """
-    with ops.op_scope([self._mu, self._sigma, x], name, "GaussianPdf"):
+    with ops.op_scope([self._mu, self._sigma, x], name, "NormalPdf"):
       return math_ops.exp(self.log_pdf(x))
 
   def entropy(self, name=None):
-    """The entropy of Gaussian distribution(s).
+    """The entropy of Normal distribution(s).
 
     Args:
       name: The name to give this op.
@@ -196,7 +196,7 @@ class Gaussian(object):
     Returns:
       entropy: tensor of dtype `dtype`, the entropy.
     """
-    with ops.op_scope([self._mu, self._sigma], name, "GaussianEntropy"):
+    with ops.op_scope([self._mu, self._sigma], name, "NormalEntropy"):
       two_pi_e1 = constant_op.constant(
           2 * math.pi * math.exp(1), dtype=self.dtype)
       # Use broadcasting rules to calculate the full broadcast sigma.
@@ -204,7 +204,7 @@ class Gaussian(object):
       return 0.5 * math_ops.log(two_pi_e1 * math_ops.square(sigma))
 
   def sample(self, n, seed=None, name=None):
-    """Sample `n` observations from the Gaussian Distributions.
+    """Sample `n` observations from the Normal Distributions.
 
     Args:
       n: `Scalar`, type int32, the number of observations to sample.
@@ -215,7 +215,7 @@ class Gaussian(object):
       samples: `[n, ...]`, a `Tensor` of `n` samples for each
         of the distributions determined by broadcasting the hyperparameters.
     """
-    with ops.op_scope([self._mu, self._sigma, n], name, "GaussianSample"):
+    with ops.op_scope([self._mu, self._sigma, n], name, "NormalSample"):
       broadcast_shape = (self._mu + self._sigma).get_shape()
       n = ops.convert_to_tensor(n)
       shape = array_ops.concat(
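As a sanity check on the `NormalEntropy` scope above, the closed form used in `entropy()` is the standard differential entropy of a normal distribution; in the code's notation, where `two_pi_e1 = 2πe`:

```latex
H\left(\mathcal{N}(\mu, \sigma^2)\right)
  = -\int_{-\infty}^{\infty} p(x)\,\log p(x)\,dx
  = \frac{1}{2}\,\log\!\left(2\pi e\,\sigma^2\right)
```

which is exactly `0.5 * math_ops.log(two_pi_e1 * math_ops.square(sigma))`, broadcast over the full `mu + sigma` shape.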
diff --git a/tensorflow/contrib/distributions/python/ops/gaussian_conjugate_posteriors.py b/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py
similarity index 73%
rename from tensorflow/contrib/distributions/python/ops/gaussian_conjugate_posteriors.py
rename to tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py
index c0089964152..45ddd3ada36 100644
--- a/tensorflow/contrib/distributions/python/ops/gaussian_conjugate_posteriors.py
+++ b/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py
@@ -12,32 +12,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""The Gaussian distribution: conjugate posterior closed form calculations."""
+"""The Normal distribution: conjugate posterior closed form calculations."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.gaussian import Gaussian  # pylint: disable=line-too-long
+from tensorflow.contrib.distributions.python.ops.normal import Normal  # pylint: disable=line-too-long
 
 from tensorflow.python.ops import math_ops
 
 
-def gaussian_conjugates_known_sigma_posterior(prior, sigma, s, n):
-  """Posterior Gaussian distribution with conjugate prior on the mean.
+def normal_conjugates_known_sigma_posterior(prior, sigma, s, n):
+  """Posterior Normal distribution with conjugate prior on the mean.
 
   This model assumes that `n` observations (with sum `s`) come from a
-  Gaussian with unknown mean `mu` (described by the Gaussian `prior`)
+  Normal with unknown mean `mu` (described by the Normal `prior`)
   and known variance `sigma^2`.  The "known sigma posterior" is
   the distribution of the unknown `mu`.
 
-  Accepts a prior Gaussian distribution object, having parameters
+  Accepts a prior Normal distribution object, having parameters
   `mu0` and `sigma0`, as well as known `sigma` values of the predictive
-  distribution(s) (also assumed Gaussian),
+  distribution(s) (also assumed Normal),
   and statistical estimates `s` (the sum(s) of the observations) and
   `n` (the number(s) of observations).
 
-  Returns a posterior (also Gaussian) distribution object, with parameters
+  Returns a posterior (also Normal) distribution object, with parameters
   `(mu', sigma'^2)`, where:
 
   ```
@@ -50,7 +50,7 @@ def gaussian_conjugates_known_sigma_posterior(prior, sigma, s, n):
   will broadcast in the case of multidimensional sets of parameters.
 
   Args:
-    prior: `Gaussian` object of type `dtype`:
+    prior: `Normal` object of type `dtype`:
       the prior distribution having parameters `(mu0, sigma0)`.
     sigma: tensor of type `dtype`, taking values `sigma > 0`.
       The known stddev parameter(s).
@@ -58,15 +58,15 @@ def gaussian_conjugates_known_sigma_posterior(prior, sigma, s, n):
     n: Tensor of type `int`.  The number(s) of observations.
 
   Returns:
-    A new Gaussian posterior distribution object for the unknown observation
+    A new Normal posterior distribution object for the unknown observation
     mean `mu`.
 
   Raises:
     TypeError: if dtype of `s` does not match `dtype`, or `prior` is not a
-      Gaussian object.
+      Normal object.
  """
-  if not isinstance(prior, Gaussian):
-    raise TypeError("Expected prior to be an instance of type Gaussian")
+  if not isinstance(prior, Normal):
+    raise TypeError("Expected prior to be an instance of type Normal")
 
   if s.dtype != prior.dtype:
     raise TypeError(
@@ -77,27 +77,27 @@ def gaussian_conjugates_known_sigma_posterior(prior, sigma, s, n):
   sigma0_2 = math_ops.square(prior.sigma)
   sigma_2 = math_ops.square(sigma)
   sigmap_2 = 1.0/(1/sigma0_2 + n/sigma_2)
-  return Gaussian(
+  return Normal(
       mu=(prior.mu/sigma0_2 + s/sigma_2) * sigmap_2,
       sigma=math_ops.sqrt(sigmap_2))
 
 
-def gaussian_congugates_known_sigma_predictive(prior, sigma, s, n):
-  """Posterior predictive Gaussian distribution w. conjugate prior on the mean.
+def normal_congugates_known_sigma_predictive(prior, sigma, s, n):
+  """Posterior predictive Normal distribution w. conjugate prior on the mean.
 
   This model assumes that `n` observations (with sum `s`) come from a
-  Gaussian with unknown mean `mu` (described by the Gaussian `prior`)
+  Normal with unknown mean `mu` (described by the Normal `prior`)
   and known variance `sigma^2`.  The "known sigma predictive"
   is the distribution of new observations, conditioned on the existing
   observations and our prior.
 
-  Accepts a prior Gaussian distribution object, having parameters
+  Accepts a prior Normal distribution object, having parameters
   `mu0` and `sigma0`, as well as known `sigma` values of the predictive
-  distribution(s) (also assumed Gaussian),
+  distribution(s) (also assumed Normal),
   and statistical estimates `s` (the sum(s) of the observations) and
   `n` (the number(s) of observations).
 
-  Calculates the Gaussian distribution(s) `p(x | sigma^2)`:
+  Calculates the Normal distribution(s) `p(x | sigma^2)`:
 
   ```
   p(x | sigma^2) = int N(x | mu, sigma^2) N(mu | prior.mu, prior.sigma^2) dmu
@@ -117,7 +117,7 @@ def gaussian_congugates_known_sigma_predictive(prior, sigma, s, n):
   will broadcast in the case of multidimensional sets of parameters.
 
   Args:
-    prior: `Gaussian` object of type `dtype`:
+    prior: `Normal` object of type `dtype`:
       the prior distribution having parameters `(mu0, sigma0)`.
     sigma: tensor of type `dtype`, taking values `sigma > 0`.
       The known stddev parameter(s).
@@ -125,14 +125,14 @@ def gaussian_congugates_known_sigma_predictive(prior, sigma, s, n):
     n: Tensor of type `int`.  The number(s) of observations.
 
   Returns:
-    A new Gaussian predictive distribution object.
+    A new Normal predictive distribution object.
 
   Raises:
     TypeError: if dtype of `s` does not match `dtype`, or `prior` is not a
-      Gaussian object.
+      Normal object.
   """
-  if not isinstance(prior, Gaussian):
-    raise TypeError("Expected prior to be an instance of type Gaussian")
+  if not isinstance(prior, Normal):
+    raise TypeError("Expected prior to be an instance of type Normal")
 
   if s.dtype != prior.dtype:
     raise TypeError(
@@ -143,6 +143,6 @@ def gaussian_congugates_known_sigma_predictive(prior, sigma, s, n):
   sigma0_2 = math_ops.square(prior.sigma)
   sigma_2 = math_ops.square(sigma)
   sigmap_2 = 1.0/(1/sigma0_2 + n/sigma_2)
-  return Gaussian(
+  return Normal(
       mu=(prior.mu/sigma0_2 + s/sigma_2) * sigmap_2,
       sigma=math_ops.sqrt(sigmap_2 + sigma_2))
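Both helpers above implement the same normal–normal conjugacy update their docstrings describe; written out, `sigmap_2` and the returned `mu` compute

```latex
\sigma'^2 = \left(\frac{1}{\sigma_0^2} + \frac{n}{\sigma^2}\right)^{-1},
\qquad
\mu' = \sigma'^2\left(\frac{\mu_0}{\sigma_0^2} + \frac{s}{\sigma^2}\right)
```

with the posterior returning stddev `sqrt(sigmap_2)` and the predictive returning `sqrt(sigmap_2 + sigma_2)`, since a new observation adds the known noise variance on top of the posterior uncertainty over `mu`.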
diff --git a/tensorflow/contrib/ffmpeg/BUILD b/tensorflow/contrib/ffmpeg/BUILD
index 75d58ccf23b..268d7bea369 100644
--- a/tensorflow/contrib/ffmpeg/BUILD
+++ b/tensorflow/contrib/ffmpeg/BUILD
@@ -17,6 +17,8 @@ filegroup(
     srcs = glob(["testdata/*"]),
 )
 
+exports_files(["ffmpeg_lib.h"])
+
 cc_library(
     name = "decode_audio_op_cc",
     srcs = ["decode_audio_op.cc"],
diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op.cc b/tensorflow/contrib/ffmpeg/decode_audio_op.cc
index b38b9957a84..a2ecc7f287e 100644
--- a/tensorflow/contrib/ffmpeg/decode_audio_op.cc
+++ b/tensorflow/contrib/ffmpeg/decode_audio_op.cc
@@ -18,7 +18,7 @@
 #include
 #include
 
-#include "tensorflow/contrib/ffmpeg/default/ffmpeg_lib.h"
+#include "tensorflow/contrib/ffmpeg/ffmpeg_lib.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/io/path.h"
diff --git a/tensorflow/contrib/ffmpeg/default/BUILD b/tensorflow/contrib/ffmpeg/default/BUILD
index f8566df6730..e1b7bb61924 100644
--- a/tensorflow/contrib/ffmpeg/default/BUILD
+++ b/tensorflow/contrib/ffmpeg/default/BUILD
@@ -11,7 +11,10 @@ package(default_visibility = ["//tensorflow:__subpackages__"])
 cc_library(
     name = "ffmpeg_lib",
     srcs = ["ffmpeg_lib.cc"],
-    hdrs = ["ffmpeg_lib.h"],
+    hdrs = [
+        # Header is shared between implementations.
+        "//tensorflow/contrib/ffmpeg:ffmpeg_lib.h",
+    ],
     deps = [
         "//google/protobuf",
         "//tensorflow/core:framework_headers_lib",
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index 629072ed7e1..8a7b6840f67 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 // =============================================================================
 
-#include "tensorflow/contrib/ffmpeg/default/ffmpeg_lib.h"
+#include "tensorflow/contrib/ffmpeg/ffmpeg_lib.h"
 
 #include
 #include
 
@@ -212,9 +212,9 @@ Status ReadAudioFile(const string& filename,
   }
 }
 
-Status CreateAudioFile(const string& audio_format_id, int32 samples_per_second,
-                       int32 channel_count, const std::vector<float>& samples,
-                       string* output_data) {
+Status CreateAudioFile(const string& audio_format_id, int32 bits_per_second,
+                       int32 samples_per_second, int32 channel_count,
+                       const std::vector<float>& samples, string* output_data) {
   if (audio_format_id != "wav") {
     return Status(error::Code::INVALID_ARGUMENT,
                   "CreateAudioFile only supports the 'wav' audio format.");
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
index 9001341e641..ec0b19f961a 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 // =============================================================================
 
-#include "tensorflow/contrib/ffmpeg/default/ffmpeg_lib.h"
+#include "tensorflow/contrib/ffmpeg/ffmpeg_lib.h"
 
 #include
 #include
 
@@ -91,7 +91,7 @@ TEST(FfmpegLibTest, TestRoundTripGeneratedWav) {
     sine_wave.push_back(std::sin(6.28 * 440.0 * i / 20000.0));
   }
   string content;
-  ASSERT_TRUE(CreateAudioFile("wav", 20000, 1, sine_wave, &content).ok());
+  ASSERT_TRUE(CreateAudioFile("wav", 0, 20000, 1, sine_wave, &content).ok());
   string temp_filename = GetTempFilename("wav");
   ASSERT_TRUE(WriteStringToFile(Env::Default(), temp_filename, content).ok());
   std::vector<float> roundtrip_data;
@@ -122,7 +122,7 @@ TEST(FfmpegLibTest, TestRoundTripWav) {
 
   string written_audio;
   ASSERT_TRUE(
-      CreateAudioFile("wav", 10000, 1, output_samples, &written_audio).ok());
+      CreateAudioFile("wav", 0, 10000, 1, output_samples, &written_audio).ok());
   EXPECT_EQ(original_audio, written_audio);
 }
diff --git a/tensorflow/contrib/ffmpeg/encode_audio_op.cc b/tensorflow/contrib/ffmpeg/encode_audio_op.cc
index 0997c0458db..46fcbc75d74 100644
--- a/tensorflow/contrib/ffmpeg/encode_audio_op.cc
+++ b/tensorflow/contrib/ffmpeg/encode_audio_op.cc
@@ -15,7 +15,7 @@
 
 #include
 
-#include "tensorflow/contrib/ffmpeg/default/ffmpeg_lib.h"
+#include "tensorflow/contrib/ffmpeg/ffmpeg_lib.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
@@ -35,6 +35,8 @@ class EncodeAudioOp : public OpKernel {
         context, context->GetAttr("samples_per_second", &samples_per_second_));
     OP_REQUIRES(context, samples_per_second_ > 0,
                 errors::InvalidArgument("samples_per_second must be > 0."));
+    OP_REQUIRES_OK(
+        context, context->GetAttr("bits_per_second", &bits_per_second_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -61,9 +63,9 @@ class EncodeAudioOp : public OpKernel {
     }
     const int32 channel_count = contents.dim_size(1);
     string encoded_audio;
-    OP_REQUIRES_OK(context,
-                   CreateAudioFile(file_format_, samples_per_second_,
-                                   channel_count, samples, &encoded_audio));
+    OP_REQUIRES_OK(context, CreateAudioFile(file_format_, bits_per_second_,
+                                            samples_per_second_, channel_count,
+                                            samples, &encoded_audio));
 
     // Copy the encoded audio file to the output tensor.
     Tensor* output = nullptr;
@@ -75,6 +77,7 @@ class EncodeAudioOp : public OpKernel {
  private:
   string file_format_;
   int32 samples_per_second_;
+  int32 bits_per_second_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("EncodeAudio").Device(DEVICE_CPU), EncodeAudioOp);
@@ -84,6 +87,7 @@ REGISTER_OP("EncodeAudio")
     .Output("contents: string")
     .Attr("file_format: string")
     .Attr("samples_per_second: int")
+    .Attr("bits_per_second: int = 192000")
    .Doc(R"doc(
 Processes a `Tensor` containing sampled audio with the number of channels
 and length of the audio specified by the dimensions of the `Tensor`. The
@@ -100,6 +104,8 @@ sampled_audio: A rank 2 tensor containing all tracks of the audio. Dimension 0
 contents: The binary audio file contents.
 file_format: A string describing the audio file format. This must be "wav".
 samples_per_second: The number of samples per second that the audio should have.
+bits_per_second: The approximate bitrate of the encoded audio file. This is
+  ignored by the "wav" file format.
 )doc");
 
 }  // namespace ffmpeg
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.h b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
similarity index 83%
rename from tensorflow/contrib/ffmpeg/default/ffmpeg_lib.h
rename to tensorflow/contrib/ffmpeg/ffmpeg_lib.h
index d7b8f957de5..46b42c14334 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.h
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
@@ -13,10 +13,11 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_FFMPEG_DEFAULT_FFMPEG_LIB_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_FFMPEG_DEFAULT_FFMPEG_LIB_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_FFMPEG_FFMPEG_LIB_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_FFMPEG_FFMPEG_LIB_H_
 
 #include
+#include
 
 #include "tensorflow/core/lib/core/status.h"
 
@@ -40,9 +41,9 @@ Status ReadAudioFile(const string& filename,
 // contain a separate sample for each channel. Frames are ordered by time.
 // Currently, the implementation only supports wav files, and ffmpeg is not used
 // to create them.
-Status CreateAudioFile(const string& audio_format_id, int32 samples_per_second,
-                       int32 channel_count, const std::vector<float>& samples,
-                       string* output_data);
+Status CreateAudioFile(const string& audio_format_id, int32 bits_per_second,
+                       int32 samples_per_second, int32 channel_count,
+                       const std::vector<float>& samples, string* output_data);
 
 }  // namespace ffmpeg
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 261103a746f..de447847f21 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -39,6 +39,7 @@ from tensorflow.python.training import moving_averages
 # TODO(b/28426988): Remove legacy_* when all uses have migrated to new API.
 __all__ = ['bias_add',
            'batch_norm',
+           'conv2d',
            'convolution2d',
            'fully_connected',
            'linear',
@@ -113,7 +114,7 @@ def batch_norm(inputs,
                scale=False,
                epsilon=0.001,
                activation_fn=None,
-               updates_collection=None,
+               updates_collections=ops.GraphKeys.UPDATE_OPS,
               is_training=True,
                reuse=None,
                variables_collections=None,
@@ -138,8 +139,9 @@ def batch_norm(inputs,
       disabled since the scaling can be done by the next layer.
     epsilon: small float added to variance to avoid dividing by zero.
     activation_fn: Optional activation function.
-    updates_collection: collection to collect the update ops for computation. If
-      None a control dependency would be added to make sure they are computed.
+    updates_collections: collections to collect the update ops for computation.
+      If None, a control dependency would be added to make sure the updates are
+      computed.
     is_training: whether or not the layer is in training mode. In training mode
       it would accumulate the statistics of the moments into `moving_mean` and
       `moving_variance` using an exponential moving average with the given
@@ -207,7 +209,7 @@ def batch_norm(inputs,
                                                          moving_mean, mean, decay)
     update_moving_variance = moving_averages.assign_moving_average(
         moving_variance, variance, decay)
-    if updates_collection is None:
+    if updates_collections is None:
       # Make sure the updates are computed here.
       with ops.control_dependencies([update_moving_mean,
                                      update_moving_variance]):
@@ -215,8 +217,8 @@ def batch_norm(inputs,
         outputs = nn.batch_normalization(
             inputs, mean, variance, beta, gamma, epsilon)
     else:
       # Collect the updates to be computed later.
-      ops.add_to_collection(updates_collection, update_moving_mean)
-      ops.add_to_collection(updates_collection, update_moving_variance)
+      ops.add_to_collections(updates_collections, update_moving_mean)
+      ops.add_to_collections(updates_collections, update_moving_variance)
       outputs = nn.batch_normalization(
           inputs, mean, variance, beta, gamma, epsilon)
   else:
@@ -504,22 +506,6 @@ def legacy_fully_connected(x,
   Raises:
     ValueError: if x has rank less than 2 or if its last dimension is not set.
   """
-  # pylint: enable=anomalous-backslash-in-string
-# TODO(ptucker) redirect to fully_connected
-#   _ = trainable
-#   variables_collections = {'weights': weight_collections,
-#                            'biases': bias_collections}
-#   outputs = fully_connected(inputs=x,
-#                             num_outputs=num_output_units,
-#                             activation_fn=activation_fn,
-#                             weights_initializer=weight_init,
-#                             weights_regularizer=weight_regularizer,
-#                             biases_initializer=bias_init,
-#                             biases_regularizer=bias_regularizer,
-#                             variables_collections=variables_collections,
-#                             scope=name)
-#   ops.add_to_collections(output_collections, outputs)
-#   return outputs
   with variable_scope.variable_op_scope([x], name, 'fully_connected'):
     dims = x.get_shape().dims
     if dims is None:
@@ -645,24 +631,6 @@ def legacy_convolution2d(x,
   Raises:
     ValueError: If `kernel_size` or `stride` are not length 2.
   """
-# TODO(ptucker) redirect to convolution2d
-#   _ = trainable
-#   variables_collections = {'weights': weight_collections,
-#                            'biases': bias_collections}
-#   outputs = convolution2d(inputs=x,
-#                           num_outputs=num_output_channels,
-#                           kernel_size=kernel_size,
-#                           stride=stride,
-#                           padding=padding,
-#                           activation_fn=activation_fn,
-#                           weights_initializer=weight_init,
-#                           weights_regularizer=weight_regularizer,
-#                           biases_initializer=bias_init,
-#                           biases_regularizer=bias_regularizer,
-#                           variables_collections=variables_collections,
-#                           scope=name)
-#   ops.add_to_collections(output_collections, outputs)
-#   return outputs
   with variable_scope.variable_op_scope([x], name, 'convolution2d'):
     num_input_channels = x.get_shape().dims[3].value
@@ -714,3 +682,6 @@
 linear = legacy_linear
 relu = legacy_relu
 relu6 = legacy_relu6
+
+# Simple alias for convolution2d.
+conv2d = convolution2d
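The new `updates_collections` default changes how training loops consume batch_norm's moving-average updates. A sketch of the two resulting usage patterns, mirroring the updated tests that follow:

```python
import tensorflow as tf

images = tf.random_uniform((5, 3, 3, 3), seed=1)

# Default: updates are parked in tf.GraphKeys.UPDATE_OPS and must be run
# as part of the training step, e.g. via a control dependency.
output = tf.contrib.layers.batch_norm(images, decay=0.1)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
  output = tf.identity(output)

# Old behavior (moving averages updated on every forward pass): pass None.
forced = tf.contrib.layers.batch_norm(images, decay=0.1,
                                      updates_collections=None)
```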
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index de073e573eb..0c3be3c98f7 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -430,8 +430,8 @@ class BatchNormTest(tf.test.TestCase):
     height, width = 3, 3
     with self.test_session():
       images = tf.random_uniform((5, height, width, 3), seed=1)
-      tf.contrib.layers.batch_norm(images, updates_collection='update_ops')
-      update_layers = tf.get_collection('update_ops')
+      tf.contrib.layers.batch_norm(images, updates_collections='my_update_ops')
+      update_layers = tf.get_collection('my_update_ops')
       update_moving_mean = update_layers[0]
       update_moving_variance = update_layers[1]
       self.assertEquals(update_moving_mean.op.name,
@@ -460,7 +460,7 @@ class BatchNormTest(tf.test.TestCase):
     with self.test_session():
       images = tf.random_uniform((5, height, width, 3), seed=1)
       with tf.contrib.framework.arg_scope([tf.contrib.layers.batch_norm],
-                                          updates_collection='update_ops'):
+                                          updates_collections='update_ops'):
         tf.contrib.layers.batch_norm(images, scope='bn')
         self.assertEquals(len(tf.get_collection('update_ops')), 2)
         tf.contrib.layers.batch_norm(images, scope='bn', reuse=True)
@@ -479,7 +479,7 @@ class BatchNormTest(tf.test.TestCase):
       self.assertEquals(len(moving_variance), 1)
       self.assertEquals(moving_variance[0].op.name, 'BatchNorm/moving_variance')
 
-  def testUpdateMovingVars(self):
+  def testForceUpdateMovingVars(self):
     height, width = 3, 3
     with self.test_session() as sess:
       image_shape = (10, height, width, 3)
@@ -487,7 +487,8 @@ class BatchNormTest(tf.test.TestCase):
       expected_mean = np.mean(image_values, axis=(0, 1, 2))
       expected_var = np.var(image_values, axis=(0, 1, 2))
       images = tf.constant(image_values, shape=image_shape, dtype=tf.float32)
-      output = tf.contrib.layers.batch_norm(images, decay=0.1)
+      output = tf.contrib.layers.batch_norm(images, decay=0.1,
+                                            updates_collections=None)
       # Initialize all variables
       sess.run(tf.initialize_all_variables())
       moving_mean = tf.contrib.framework.get_variables(
@@ -515,9 +516,8 @@ class BatchNormTest(tf.test.TestCase):
       expected_mean = np.mean(image_values, axis=(0, 1, 2))
       expected_var = np.var(image_values, axis=(0, 1, 2))
       images = tf.constant(image_values, shape=image_shape, dtype=tf.float32)
-      output = tf.contrib.layers.batch_norm(images, decay=0.1,
-                                            updates_collection='update_ops')
-      update_ops = tf.get_collection('update_ops')
+      output = tf.contrib.layers.batch_norm(images, decay=0.1)
+      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
       with tf.control_dependencies(update_ops):
         barrier = tf.no_op(name='barrier')
       output = control_flow_ops.with_dependencies([barrier], output)
@@ -550,10 +550,9 @@ class BatchNormTest(tf.test.TestCase):
       images = tf.constant(image_values, shape=image_shape, dtype=tf.float32)
       output = tf.contrib.layers.batch_norm(images,
                                             decay=0.1,
-                                            is_training=False,
-                                            updates_collection='update_ops')
-      update_layers = tf.get_collection('update_ops')
-      self.assertEquals(update_layers, [])
+                                            is_training=False)
+      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+      self.assertEquals(update_ops, [])
       # Initialize all variables
       sess.run(tf.initialize_all_variables())
       moving_mean = tf.contrib.framework.get_variables(
@@ -587,10 +586,9 @@ class BatchNormTest(tf.test.TestCase):
       images = tf.constant(image_values, shape=image_shape, dtype=tf.float32)
       output = tf.contrib.layers.batch_norm(images,
                                             decay=0.1,
-                                            is_training=False,
-                                            updates_collection='update_ops')
-      update_layers = tf.get_collection('update_ops')
-      self.assertEquals(update_layers, [])
+                                            is_training=False)
+      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+      self.assertEquals(update_ops, [])
       # Initialize all variables
       sess.run(tf.initialize_all_variables())
       moving_mean = tf.contrib.framework.get_variables(
diff --git a/tensorflow/contrib/learn/python/learn/__init__.py b/tensorflow/contrib/learn/python/learn/__init__.py
index 8de7797e6b7..1d72243f992 100644
--- a/tensorflow/contrib/learn/python/learn/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/__init__.py
@@ -1,5 +1,4 @@
-"""Main Scikit Flow module."""
-#  Copyright 2015-present The Scikit Flow Authors. All Rights Reserved.
+#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -13,6 +12,8 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+"""High level API for learning with TensorFlow."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py
index 7f78b2dced9..9c29b9eeb11 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/base.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/base.py
@@ -1,5 +1,4 @@
-"""Base utilities for loading datasets."""
-#  Copyright 2015-present The Scikit Flow Authors. All Rights Reserved.
+#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -13,6 +12,8 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+"""Base utilities for loading datasets."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/tensorflow/contrib/learn/python/learn/estimators/__init__.py b/tensorflow/contrib/learn/python/learn/estimators/__init__.py
index e714c15f2e0..1b0d0aef6f5 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/__init__.py
@@ -1,5 +1,4 @@
-"""Scikit Flow Estimators."""
-#  Copyright 2015-present The Scikit Flow Authors. All Rights Reserved.
+#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -12,12 +11,16 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+
+"""Estimators."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.learn.python.learn.estimators.autoencoder import TensorFlowDNNAutoencoder
-from tensorflow.contrib.learn.python.learn.estimators.base import TensorFlowEstimator, TensorFlowBaseTransformer
+from tensorflow.contrib.learn.python.learn.estimators.base import TensorFlowBaseTransformer
+from tensorflow.contrib.learn.python.learn.estimators.base import TensorFlowEstimator
 from tensorflow.contrib.learn.python.learn.estimators.dnn import DNNClassifier
 from tensorflow.contrib.learn.python.learn.estimators.dnn import DNNRegressor
 from tensorflow.contrib.learn.python.learn.estimators.dnn import TensorFlowDNNClassifier
diff --git a/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py b/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py
index dcd1d81056b..5032ea966d4 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py
@@ -1,5 +1,4 @@
-"""sklearn cross-support."""
-#  Copyright 2015-present The Scikit Flow Authors. All Rights Reserved.
+#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -12,6 +11,9 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+
+"""sklearn cross-support."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -20,6 +22,8 @@ import collections
 import os
 
 import numpy as np
+import six
+
 
 def _pprint(d):
   return ', '.join(['%s=%s' % (key, str(value)) for key, value in d.items()])
@@ -102,6 +106,7 @@ class _BaseEstimator(object):
                        _pprint(self.get_params(deep=False)),)
 
 
+# pylint: disable=old-style-class
 class _ClassifierMixin():
   """Mixin class for all classifiers."""
   pass
@@ -111,8 +116,10 @@ class _RegressorMixin():
   """Mixin class for all regression estimators."""
   pass
 
+
 class _TransformerMixin():
-  """Mixin class for all transformer estimators."""
+  """Mixin class for all transformer estimators."""
+
 
 class _NotFittedError(ValueError, AttributeError):
   """Exception class to raise if estimator is used before fitting.
@@ -134,6 +141,8 @@ class _NotFittedError(ValueError, AttributeError):
   https://github.com/scikit-learn/scikit-learn/master/sklearn/exceptions.py
   """
 
+# pylint: enable=old-style-class
+
 
 def _accuracy_score(y_true, y_pred):
   score = y_true == y_pred
@@ -149,8 +158,7 @@ def _mean_squared_error(y_true, y_pred):
 
 
 def _train_test_split(*args, **options):
-  n_array = len(args)
-
+  # pylint: disable=missing-docstring
   test_size = options.pop('test_size', None)
   train_size = options.pop('train_size', None)
   random_state = options.pop('random_state', None)
@@ -159,7 +167,7 @@ def _train_test_split(*args, **options):
     train_size = 0.75
   elif train_size is None:
     train_size = 1 - test_size
-  train_size = train_size * args[0].shape[0]
+  train_size *= args[0].shape[0]
 
   np.random.seed(random_state)
   indices = np.random.permutation(args[0].shape[0])
@@ -173,6 +181,7 @@
 # If "TENSORFLOW_SKLEARN" flag is defined then try to import from sklearn.
 TRY_IMPORT_SKLEARN = os.environ.get('TENSORFLOW_SKLEARN', False)
 if TRY_IMPORT_SKLEARN:
+  # pylint: disable=g-import-not-at-top,g-multiple-import,unused-import
  from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin
   from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
   from sklearn.cross_validation import train_test_split
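For reference, the split logic after the `_train_test_split` fix reduces to the following plain-NumPy sketch (hypothetical data; `train_size` defaults to 0.75 when neither size option is given):

```python
import numpy as np

X = np.arange(20).reshape(10, 2)
y = np.arange(10)

train_size = 0.75            # default when neither option is given
train_size *= X.shape[0]     # the fixed in-place scaling: 7.5 of 10 rows

np.random.seed(42)
indices = np.random.permutation(X.shape[0])
train_idx, test_idx = indices[:int(train_size)], indices[int(train_size):]
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
```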
diff --git a/tensorflow/contrib/learn/python/learn/estimators/autoencoder.py b/tensorflow/contrib/learn/python/learn/estimators/autoencoder.py
index 690bac8f196..a3f41697680 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/autoencoder.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/autoencoder.py
@@ -1,5 +1,4 @@
-"""Deep Autoencoder estimators."""
-#  Copyright 2015-present The Scikit Flow Authors. All Rights Reserved.
+#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -12,105 +11,115 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+
+"""Deep Autoencoder estimators."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.ops import nn
-from tensorflow.contrib.learn.python.learn.estimators.base import TensorFlowBaseTransformer
+import numpy as np
+
 from tensorflow.contrib.learn.python.learn import models
+from tensorflow.contrib.learn.python.learn.estimators.base import TensorFlowBaseTransformer
+from tensorflow.python.ops import nn
 
 
 class TensorFlowDNNAutoencoder(TensorFlowBaseTransformer):
-    """TensorFlow Autoencoder Regressor model.
+  """TensorFlow Autoencoder Regressor model.
 
-    Parameters:
-        hidden_units: List of hidden units per layer.
-        batch_size: Mini batch size.
-        activation: activation function used to map inner latent layer onto
-                    reconstruction layer.
-        add_noise: a function that adds noise to tensor_in,
-               e.g. def add_noise(x):
-                        return(x + np.random.normal(0, 0.1, (len(x), len(x[0]))))
-        steps: Number of steps to run over data.
-        optimizer: Optimizer name (or class), for example "SGD", "Adam",
-                   "Adagrad".
-        learning_rate: If this is constant float value, no decay function is used.
-            Instead, a customized decay function can be passed that accepts
-            global_step as parameter and returns a Tensor.
-            e.g. exponential decay function:
-            def exp_decay(global_step):
-                return tf.train.exponential_decay(
-                    learning_rate=0.1, global_step,
-                    decay_steps=2, decay_rate=0.001)
-        continue_training: when continue_training is True, once initialized
-            model will be continuely trained on every call of fit.
-        config: RunConfig object that controls the configurations of the session,
-            e.g. num_cores, gpu_memory_fraction, etc.
-        verbose: Controls the verbosity, possible values:
-                 0: the algorithm and debug information is muted.
-                 1: trainer prints the progress.
-                 2: log device placement is printed.
-        dropout: When not None, the probability we will drop out a given
-                 coordinate.
-    """
-    def __init__(self, hidden_units, n_classes=0, batch_size=32,
-                 steps=200, optimizer="Adagrad", learning_rate=0.1,
-                 clip_gradients=5.0, activation=nn.relu, add_noise=None,
-                 continue_training=False, config=None,
-                 verbose=1, dropout=None):
-        self.hidden_units = hidden_units
-        self.dropout = dropout
-        self.activation = activation
-        self.add_noise = add_noise
-        super(TensorFlowDNNAutoencoder, self).__init__(
-            model_fn=self._model_fn,
-            n_classes=n_classes,
-            batch_size=batch_size, steps=steps, optimizer=optimizer,
-            learning_rate=learning_rate, clip_gradients=clip_gradients,
-            continue_training=continue_training,
-            config=config, verbose=verbose)
+  Parameters:
+    hidden_units: List of hidden units per layer.
+    batch_size: Mini batch size.
+    activation: activation function used to map inner latent layer onto
+      reconstruction layer.
+    add_noise: a function that adds noise to tensor_in,
+      e.g. def add_noise(x):
+        return(x + np.random.normal(0, 0.1, (len(x), len(x[0]))))
+    steps: Number of steps to run over data.
+    optimizer: Optimizer name (or class), for example "SGD", "Adam",
+      "Adagrad".
+    learning_rate: If this is constant float value, no decay function is used.
+      Instead, a customized decay function can be passed that accepts
+      global_step as parameter and returns a Tensor.
+      e.g. exponential decay function:
+      def exp_decay(global_step):
+        return tf.train.exponential_decay(
+            learning_rate=0.1, global_step,
+            decay_steps=2, decay_rate=0.001)
+    continue_training: when continue_training is True, once initialized
+      model will be continuely trained on every call of fit.
+    config: RunConfig object that controls the configurations of the session,
+      e.g. num_cores, gpu_memory_fraction, etc.
+    verbose: Controls the verbosity, possible values:
+      0: the algorithm and debug information is muted.
+      1: trainer prints the progress.
+      2: log device placement is printed.
+    dropout: When not None, the probability we will drop out a given
+      coordinate.
+  """
 
-    def _model_fn(self, X, y):
-        encoder, decoder, autoencoder_estimator = models.get_autoencoder_model(
-            self.hidden_units,
-            models.linear_regression,
-            activation=self.activation,
-            add_noise=self.add_noise,
-            dropout=self.dropout)(X)
-        self.encoder = encoder
-        self.decoder = decoder
-        return autoencoder_estimator
+  def __init__(self, hidden_units, n_classes=0, batch_size=32,
+               steps=200, optimizer="Adagrad", learning_rate=0.1,
+               clip_gradients=5.0, activation=nn.relu, add_noise=None,
+               continue_training=False, config=None,
+               verbose=1, dropout=None):
+    self.hidden_units = hidden_units
+    self.dropout = dropout
+    self.activation = activation
+    self.add_noise = add_noise
+    super(TensorFlowDNNAutoencoder, self).__init__(
+        model_fn=self._model_fn,
+        n_classes=n_classes,
+        batch_size=batch_size, steps=steps, optimizer=optimizer,
+        learning_rate=learning_rate, clip_gradients=clip_gradients,
+        continue_training=continue_training,
+        config=config, verbose=verbose)
 
-    def generate(self, hidden=None):
-        """Generate new data using trained construction layer"""
-        if hidden is None:
-            last_layer = len(self.hidden_units) - 1
-            bias = self.get_tensor_value('encoder/dnn/layer%d/Linear/Bias:0' % last_layer)
-            import numpy as np
-            hidden = np.random.normal(size=bias.shape)
-            hidden = np.reshape(hidden, (1, len(hidden)))
-        return self._session.run(self.decoder, feed_dict={self.encoder: hidden})
+  def _model_fn(self, X, y):
+    encoder, decoder, autoencoder_estimator = models.get_autoencoder_model(
+        self.hidden_units,
+        models.linear_regression,
+        activation=self.activation,
+        add_noise=self.add_noise,
+        dropout=self.dropout)(X)
+    self.encoder = encoder
+    self.decoder = decoder
+    return autoencoder_estimator
 
-    @property
-    def weights_(self):
-        """Returns weights of the autoencoder's weight layers."""
-        weights = []
-        for layer in range(len(self.hidden_units)):
-            weights.append(self.get_tensor_value('encoder/dnn/layer%d/Linear/Matrix:0' % layer))
-        for layer in range(len(self.hidden_units)):
-            weights.append(self.get_tensor_value('decoder/dnn/layer%d/Linear/Matrix:0' % layer))
-        weights.append(self.get_tensor_value('linear_regression/weights:0'))
-        return weights
+  def generate(self, hidden=None):
+    """Generate new data using trained construction layer."""
+    if hidden is None:
+      last_layer = len(self.hidden_units) - 1
+      bias = self.get_tensor_value(
+          "encoder/dnn/layer%d/Linear/Bias:0" % last_layer)
+      hidden = np.random.normal(size=bias.shape)
+      hidden = np.reshape(hidden, (1, len(hidden)))
+    return self._session.run(self.decoder, feed_dict={self.encoder: hidden})
 
-    @property
-    def bias_(self):
-        """Returns bias of the autoencoder's bias layers."""
-        biases = []
-        for layer in range(len(self.hidden_units)):
-            biases.append(self.get_tensor_value('encoder/dnn/layer%d/Linear/Bias:0' % layer))
-        for layer in range(len(self.hidden_units)):
-            biases.append(self.get_tensor_value('decoder/dnn/layer%d/Linear/Bias:0' % layer))
-        biases.append(self.get_tensor_value('linear_regression/bias:0'))
-        return biases
+  @property
+  def weights_(self):
+    """Returns weights of the autoencoder's weight layers."""
+    weights = []
+    for layer in range(len(self.hidden_units)):
+      weights.append(self.get_tensor_value(
+          "encoder/dnn/layer%d/Linear/Matrix:0" % layer))
+    for layer in range(len(self.hidden_units)):
+      weights.append(self.get_tensor_value(
+          "decoder/dnn/layer%d/Linear/Matrix:0" % layer))
+    weights.append(self.get_tensor_value("linear_regression/weights:0"))
+    return weights
 
+  @property
+  def bias_(self):
+    """Returns bias of the autoencoder's bias layers."""
+    biases = []
+    for layer in range(len(self.hidden_units)):
+      biases.append(self.get_tensor_value(
+          "encoder/dnn/layer%d/Linear/Bias:0" % layer))
+    for layer in range(len(self.hidden_units)):
+      biases.append(self.get_tensor_value(
+          "decoder/dnn/layer%d/Linear/Bias:0" % layer))
+    biases.append(self.get_tensor_value("linear_regression/bias:0"))
+    return biases
diff --git a/tensorflow/contrib/learn/python/learn/estimators/base.py b/tensorflow/contrib/learn/python/learn/estimators/base.py
index 39131f059b0..ab00ae76f78 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/base.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/base.py
@@ -1,5 +1,4 @@
-"""Base estimator class."""
-#  Copyright 2015-present The Scikit Flow Authors. All Rights Reserved.
+#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
@@ -12,18 +11,17 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+
+"""Base estimator class."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import datetime
 import json
 import os
-import shutil
 from six import string_types
 
-import numpy as np
-
 from google.protobuf import text_format
 
 from tensorflow.python.platform import gfile
+ +"""Deep Neural Network estimators.""" + from __future__ import absolute_import from __future__ import division from __future__ import print_function diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index 0fce7d140f1..1f476e13937 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -563,9 +563,13 @@ class Estimator(BaseEstimator): input_fn=input_fn, batch_size=batch_size) if self._classification: - for key in predictions: - cur_axis = (len(predictions[key].shape) - 1) if axis is None else axis - predictions[key] = np.argmax(predictions[key], axis=cur_axis) + if isinstance(predictions, dict): + for key in predictions: + cur_axis = (len(predictions[key].shape) - 1) if axis is None else axis + predictions[key] = np.argmax(predictions[key], axis=cur_axis) + else: + cur_axis = (len(predictions.shape) - 1) if axis is None else axis + predictions = np.argmax(predictions, axis=cur_axis) return predictions def predict_proba(self, x=None, input_fn=None, batch_size=None): diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py index 40a455c6bf1..b45cf8af168 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py @@ -36,6 +36,17 @@ def boston_input_fn(): return features, target +def iris_input_fn(): + iris = tf.contrib.learn.datasets.load_iris() + features = tf.cast( + tf.reshape( + tf.constant(iris.data), [-1, 4]), tf.float32) + target = tf.cast( + tf.reshape( + tf.constant(iris.target), [-1, 1]), tf.int32) + return features, target + + def boston_eval_fn(): boston = tf.contrib.learn.datasets.load_boston() n_examples = len(boston.target) @@ -52,6 +63,10 @@ def linear_model_fn(features, target, unused_mode): return tf.contrib.learn.models.linear_regression_zero_init(features, target) +def logistic_model_fn(features, target, unused_mode): + return tf.contrib.learn.models.logistic_regression_zero_init(features, target) + + class CheckCallsMonitor(tf.contrib.learn.monitors.BaseMonitor): def __init__(self): @@ -84,6 +99,15 @@ class EstimatorTest(tf.test.TestCase): other_score = mean_squared_error(predictions, boston.target) self.assertAllClose(other_score, scores['mean_squared_error']) + def testIrisAll(self): + iris = tf.contrib.learn.datasets.load_iris() + est = tf.contrib.learn.Estimator(model_fn=logistic_model_fn, + classification=True) + est.train(input_fn=iris_input_fn, steps=100) + _ = est.evaluate(input_fn=iris_input_fn, steps=1) + predictions = est.predict(x=iris.data) + self.assertEqual(predictions.shape[0], iris.target.shape[0]) + def testTrainInputFn(self): est = tf.contrib.learn.Estimator(model_fn=linear_model_fn, classification=False) diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py index d58ab35f5ee..ef73c44013a 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/linear.py +++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py @@ -1,5 +1,4 @@ -"""Linear Estimators.""" -# Copyright 2015-present The Scikit Flow Authors. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +"""Linear Estimators.""" + from __future__ import absolute_import from __future__ import division from __future__ import print_function diff --git a/tensorflow/contrib/learn/python/learn/estimators/rnn.py b/tensorflow/contrib/learn/python/learn/estimators/rnn.py index b703f607657..719a19a5bc8 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/rnn.py +++ b/tensorflow/contrib/learn/python/learn/estimators/rnn.py @@ -1,5 +1,4 @@ -"""Recurrent Neural Network estimators.""" -# Copyright 2015-present The Scikit Flow Authors. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +"""Recurrent Neural Network estimators.""" + from __future__ import absolute_import from __future__ import division from __future__ import print_function diff --git a/tensorflow/contrib/learn/python/learn/io/data_feeder.py b/tensorflow/contrib/learn/python/learn/io/data_feeder.py index 04bbd997482..b3ed3bc7d92 100644 --- a/tensorflow/contrib/learn/python/learn/io/data_feeder.py +++ b/tensorflow/contrib/learn/python/learn/io/data_feeder.py @@ -1,6 +1,4 @@ -"""Implementations of different data feeders to provide data for TF trainer.""" - -# Copyright 2015-present The Scikit Flow Authors. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Implementations of different data feeders to provide data for TF trainer.""" + # TODO(ipolosukhin): Replace this module with feed-dict queue runners & queues. from __future__ import absolute_import diff --git a/tensorflow/contrib/learn/python/learn/models.py b/tensorflow/contrib/learn/python/learn/models.py index 8cabd390fc7..dddd152f368 100644 --- a/tensorflow/contrib/learn/python/learn/models.py +++ b/tensorflow/contrib/learn/python/learn/models.py @@ -1,5 +1,4 @@ -"""Various high level TF models.""" -# Copyright 2015-present The Scikit Flow Authors. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +"""Various high level TF models.""" + from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.contrib.learn.python.learn.ops import autoencoder_ops from tensorflow.contrib.learn.python.learn.ops import dnn_ops from tensorflow.contrib.learn.python.learn.ops import losses_ops -from tensorflow.contrib.learn.python.learn.ops import autoencoder_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops as array_ops_ @@ -29,8 +31,7 @@ from tensorflow.python.ops import variable_scope as vs def linear_regression_zero_init(X, y): - """Creates a linear regression TensorFlow subgraph, in which weights and - bias terms are initialized to exactly zero. + """Linear regression subgraph with zero-value initial weights and bias. Args: X: tensor or placeholder for input features. @@ -43,8 +44,7 @@ def linear_regression_zero_init(X, y): def logistic_regression_zero_init(X, y): - """Creates a logistic regression TensorFlow subgraph, in which weights and - bias terms are initialized to exactly zero. + """Logistic regression subgraph with zero-value initial weights and bias. Args: X: tensor or placeholder for input features. @@ -85,7 +85,7 @@ def linear_regression(X, y, init_mean=None, init_stddev=1.0): else: output_shape = y_shape[1] # Set up the requested initialization. - if (init_mean is None): + if init_mean is None: weights = vs.get_variable('weights', [X.get_shape()[1], output_shape]) bias = vs.get_variable('bias', [output_shape]) else: @@ -134,7 +134,7 @@ def logistic_regression(X, logging_ops.histogram_summary('logistic_regression.X', X) logging_ops.histogram_summary('logistic_regression.y', y) # Set up the requested initialization. - if (init_mean is None): + if init_mean is None: weights = vs.get_variable('weights', [X.get_shape()[1], y.get_shape()[-1]]) bias = vs.get_variable('bias', [y.get_shape()[-1]]) @@ -188,35 +188,37 @@ def get_dnn_model(hidden_units, target_predictor_fn, dropout=None): return dnn_estimator + def get_autoencoder_model(hidden_units, target_predictor_fn, activation, add_noise=None, dropout=None): - """Returns a function that creates a Autoencoder TensorFlow subgraph with given - params. + """Returns a function that creates a Autoencoder TensorFlow subgraph. - Args: - hidden_units: List of values of hidden units for layers. - target_predictor_fn: Function that will predict target from input - features. This can be logistic regression, - linear regression or any other model, - that takes X, y and returns predictions and loss tensors. - activation: activation function used to map inner latent layer onto - reconstruction layer. - add_noise: a function that adds noise to tensor_in, - e.g. def add_noise(x): - return(x + np.random.normal(0, 0.1, (len(x), len(x[0])))) - dropout: When not none, causes dropout regularization to be used, - with the specified probability of removing a given coordinate. + Args: + hidden_units: List of values of hidden units for layers. + target_predictor_fn: Function that will predict target from input + features. This can be logistic regression, + linear regression or any other model, + that takes X, y and returns predictions and loss + tensors. + activation: activation function used to map inner latent layer onto + reconstruction layer. + add_noise: a function that adds noise to tensor_in, + e.g. 
def add_noise(x): + return(x + np.random.normal(0, 0.1, (len(x), len(x[0])))) + dropout: When not none, causes dropout regularization to be used, + with the specified probability of removing a given coordinate. + + Returns: + A function that creates the subgraph. + """ + def dnn_autoencoder_estimator(X): + """Autoencoder estimator with target predictor function on top.""" + encoder, decoder = autoencoder_ops.dnn_autoencoder( + X, hidden_units, activation, + add_noise=add_noise, dropout=dropout) + return encoder, decoder, target_predictor_fn(X, decoder) + return dnn_autoencoder_estimator - Returns: - A function that creates the subgraph. - """ - def dnn_autoencoder_estimator(X): - """Autoencoder estimator with target predictor function on top.""" - encoder, decoder = autoencoder_ops.dnn_autoencoder( - X, hidden_units, activation, - add_noise=add_noise, dropout=dropout) - return encoder, decoder, target_predictor_fn(X, decoder) - return dnn_autoencoder_estimator ## This will be in Tensorflow 0.7. ## TODO(ilblackdragon): Clean this up when it's released diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py index 861db1758f5..79c629d9491 100644 --- a/tensorflow/contrib/learn/python/learn/monitors.py +++ b/tensorflow/contrib/learn/python/learn/monitors.py @@ -1,5 +1,4 @@ -"""Monitors to track model training, report on progress and request early stopping""" -# Copyright 2015-present The Scikit Flow Authors. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Monitors to track training, report progress and request early stopping.""" + from __future__ import absolute_import from __future__ import division from __future__ import print_function diff --git a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc index c03cb27df50..c911290f28b 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc @@ -81,6 +81,9 @@ class GPUAllocatorRetryTest : public ::testing::Test { return; } } + // Failures are more likely to occur if each consumer + // delays for a while before returning the memory. + Env::Default()->SleepForMicroseconds(500); ++consumer_count_[i]; for (int j = 0; j < cap_needed; ++j) { alloc_->DeallocateRaw(ptr); @@ -141,9 +144,10 @@ TEST_F(GPUAllocatorRetryTest, RetrySuccess) { EXPECT_GT(consumer_count_[2], 0); } -/* Disabled due to flakiness. b/24738751 // Verifies OutOfMemory failure when memory is slightly overcommitted -// and retry is not allowed. +// and retry is not allowed. Note that this test will fail, i.e. no +// memory alloc failure will be detected, if it is run in a context that +// does not permit real multi-threaded execution. TEST_F(GPUAllocatorRetryTest, NoRetryFail) { // Support up to 2 allocations simultaneously, waits up to 0 msec for // a chance to alloc. @@ -162,7 +166,6 @@ TEST_F(GPUAllocatorRetryTest, NoRetryFail) { EXPECT_TRUE(has_failed_); } } -*/ // Verifies OutOfMemory failure when retry is allowed but memory capacity // is too low even for retry. 
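
For orientation, the autoencoder pieces above (TensorFlowDNNAutoencoder and
get_autoencoder_model) compose an encoder, a mirrored decoder, and a
linear-regression reconstruction head. A minimal usage sketch, assuming the
class is exported under tensorflow.contrib.learn at this revision and that
fit() accepts the unlabeled feature matrix directly; the data shape and the
noise function are illustrative only:

    import numpy as np
    from tensorflow.contrib import learn

    X = np.random.rand(1000, 64).astype('float32')  # hypothetical unlabeled data

    def add_noise(x):
      # Gaussian corruption, in the form suggested by the docstring above.
      return x + np.random.normal(0, 0.1, (len(x), len(x[0])))

    # hidden_units=[32, 8] gives a 64 -> 32 -> 8 encoder and a mirrored
    # decoder; the predictor on top reconstructs the input.
    autoencoder = learn.TensorFlowDNNAutoencoder(
        hidden_units=[32, 8], add_noise=add_noise, dropout=0.5, steps=500)
    autoencoder.fit(X)

    # generate() draws a latent vector (shaped like the last encoder bias)
    # and runs it through the decoder, per the implementation above.
    sample = autoencoder.generate()
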
diff --git a/tensorflow/core/graph/dot.cc b/tensorflow/core/graph/dot.cc
index 799bbe71475..a546b84ee13 100644
--- a/tensorflow/core/graph/dot.cc
+++ b/tensorflow/core/graph/dot.cc
@@ -32,7 +32,7 @@ static string GraphNodeName(const DotOptions& opts, const Node* n) {
   return strings::StrCat("N", n->id());
 }
 
-bool ShoulDisplayOpType(const Node* n) {
+bool ShouldDisplayOpType(const Node* n) {
   if (n->type_string() == "NoOp") {
     return false;
   }
@@ -125,7 +125,7 @@ string DotGraph(const Graph& g, const DotOptions& opts) {
       continue;
     }
     string label = src->name();
-    if (ShoulDisplayOpType(src)) {
+    if (ShouldDisplayOpType(src)) {
       // Append the op type if it is not directly deducible from the op name.
       strings::StrAppend(&label, "\\n(", src->type_string(), ")");
     }
@@ -137,7 +137,14 @@ string DotGraph(const Graph& g, const DotOptions& opts) {
       shape = "oval";
     } else {
       const string& d = src->assigned_device_name();
-      const int dindex = (!d.empty()) ? device_index[d] : -1;
+
+      int dindex;
+      if (opts.node_color) {
+        dindex = opts.node_color(src);
+      } else {
+        dindex = (!d.empty()) ? device_index[d] : -1;
+      }
+
       if (dindex >= 0) {
         color = ColorFor(dindex);
       }
diff --git a/tensorflow/core/graph/dot.h b/tensorflow/core/graph/dot.h
index 79a538978a8..96e48773a9a 100644
--- a/tensorflow/core/graph/dot.h
+++ b/tensorflow/core/graph/dot.h
@@ -48,6 +48,11 @@ struct DotOptions {
   // A function that returns the "cost" of the edge. The dot display
   // makes a edge thickness proportional to its cost.
   std::function<double(const Edge*)> edge_cost;
+
+  // A function that returns a color number to apply to each node. < 0 means
+  // no color. A color will be assigned to each color number from a palette;
+  // adjacent color numbers will receive different colors.
+  std::function<int(const Node*)> node_color;
 };
 
 // Return a string that contains a graphviz specification of the graph.
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc index 36cd60a9da0..9af6dfb8d5a 100644 --- a/tensorflow/core/kernels/concat_op.cc +++ b/tensorflow/core/kernels/concat_op.cc @@ -76,7 +76,7 @@ class ConcatOp : public OpKernel { for (int d = 0; d < concat_dim; ++d) { inputs_flat_dim0 *= input_shape.dim_size(d); } - int output_concat_dim = 0; + int64 output_concat_dim = 0; const bool input_is_scalar = IsLegacyScalar(input_shape); for (int i = 0; i < N; ++i) { const auto in = values[i]; diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h index a99bb6a092d..774436bacd8 100644 --- a/tensorflow/core/kernels/eigen_spatial_convolutions.h +++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h @@ -61,6 +61,7 @@ class TensorContractionInputMapper< typedef SubMapper LinearMapper; typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC TensorContractionInputMapper( const TensorEvaluator< const TensorReshapingOp< @@ -77,7 +78,7 @@ class TensorContractionInputMapper< m_patch_cols = tensor.impl().dimensions()[2]; m_num_patches = tensor.impl().dimensions()[3]; } else { - static const int NumDims = tensor.impl().dimensions().size(); + const int NumDims = tensor.impl().dimensions().size(); patch_depth = tensor.impl().dimensions()[NumDims - 1]; patch_rows = tensor.impl().dimensions()[NumDims - 2]; m_patch_cols = tensor.impl().dimensions()[NumDims - 3]; @@ -99,7 +100,7 @@ class TensorContractionInputMapper< m_inputRows = tensor.impl().impl().dimensions()[1]; m_inputCols = tensor.impl().impl().dimensions()[2]; } else { - static const int NumDims = tensor.impl().impl().dimensions().size(); + const int NumDims = tensor.impl().impl().dimensions().size(); m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2]; m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3]; } @@ -121,6 +122,7 @@ class TensorContractionInputMapper< m_fastDimZero = internal::TensorIntDivisor(patch_depth); } + EIGEN_DEVICE_FUNC TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper) : m_impl(base_mapper.m_impl) { m_patch_cols = base_mapper.m_patch_cols; @@ -650,8 +652,10 @@ struct gemm_pack_rhs< SubMapper; typedef SubMapper DataMapper; + EIGEN_DEVICE_FUNC static inline Index ceil_div(Index a, Index b) { return (a + b - 1) / b; } + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0) const { @@ -822,8 +826,10 @@ struct gemm_pack_rhs< SubMapper; typedef SubMapper DataMapper; + EIGEN_DEVICE_FUNC static inline Index ceil_div(Index a, Index b) { return (a + b - 1) / b; } + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0) const { @@ -898,36 +904,40 @@ struct gemm_pack_rhs< * */ template -EIGEN_ALWAYS_INLINE static const typename internal::conditional< - internal::traits::Layout == ColMajor, - TensorReshapingOp< - const DSizes::Index, - internal::traits::NumDimensions>, - const TensorContractionOp< - const array::Index>, 1>, - const TensorReshapingOp< - const DSizes::Index, 2>, - const Kernel>, - const TensorReshapingOp< - const DSizes::Index, 2>, - const TensorImagePatchOp > > >, - TensorReshapingOp< - const DSizes::Index, - internal::traits::NumDimensions>, - const TensorContractionOp< - const array::Index>, 1>, - const TensorReshapingOp< - const DSizes::Index, 2>, - const TensorImagePatchOp >, - 
const TensorReshapingOp< - const DSizes::Index, 2>, - const Kernel> > > >::type -SpatialConvolution(const Input& input, const Kernel& kernel, - const DenseIndex row_stride = 1, - const DenseIndex col_stride = 1, - const PaddingType padding_type = PADDING_SAME, - const DenseIndex row_in_stride = 1, - const DenseIndex col_in_stride = 1) { +EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE static const typename internal::conditional< + internal::traits::Layout == ColMajor, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array::Index>, + 1>, + const TensorReshapingOp< + const DSizes::Index, 2>, + const Kernel>, + const TensorReshapingOp< + const DSizes::Index, 2>, + const TensorImagePatchOp > > >, + TensorReshapingOp< + const DSizes::Index, + internal::traits::NumDimensions>, + const TensorContractionOp< + const array::Index>, + 1>, + const TensorReshapingOp< + const DSizes::Index, 2>, + const TensorImagePatchOp >, + const TensorReshapingOp< + const DSizes::Index, 2>, + const Kernel> > > >::type + SpatialConvolution(const Input& input, const Kernel& kernel, + const DenseIndex row_stride = 1, + const DenseIndex col_stride = 1, + const PaddingType padding_type = PADDING_SAME, + const DenseIndex row_in_stride = 1, + const DenseIndex col_in_stride = 1) { typedef typename internal::traits::Index TensorIndex; TensorRef::Scalar, internal::traits::NumDimensions, @@ -941,9 +951,9 @@ SpatialConvolution(const Input& input, const Kernel& kernel, EIGEN_STATIC_ASSERT( internal::traits::Layout == internal::traits::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE); - static const bool isColMajor = (internal::traits::Layout == ColMajor); + const bool isColMajor = (internal::traits::Layout == ColMajor); - static const int NumDims = internal::traits::NumDimensions; + const int NumDims = internal::traits::NumDimensions; // Number of filters to apply. 
This is the same as the output depth of the
  // result
diff --git a/tensorflow/core/kernels/random_shuffle_op.cc b/tensorflow/core/kernels/random_shuffle_op.cc
index d87883eae83..c81929de8d4 100644
--- a/tensorflow/core/kernels/random_shuffle_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_op.cc
@@ -46,6 +46,19 @@ static inline void RandomShuffle(Iter first, Iter last, Random& uniform) {
   }
 }
 
+template <typename IntT, typename InT, typename OutT, typename Random>
+static void IndexedShuffle(const int64 size, const InT& input_mat,
+                           OutT output_mat, Random& uniform) {
+  std::vector<IntT> permutation(size);
+  for (IntT i = 0; i < size; i++) {
+    permutation[i] = i;
+  }
+  RandomShuffle(permutation.begin(), permutation.end(), uniform);
+  for (IntT i = 0; i < size; i++) {
+    output_mat.template chip<0>(i) = input_mat.template chip<0>(permutation[i]);
+  }
+}
+
 template <typename T>
 class RandomShuffleOp : public OpKernel {
  public:
@@ -79,14 +92,10 @@ class RandomShuffleOp : public OpKernel {
                      context->allocate_output(0, input.shape(), &output));
       const auto input_mat = input.flat_outer_dims<T>();
       auto output_mat = output->flat_outer_dims<T>();
-      std::vector<int> permutation(size);
-      for (int i = 0; i < size; i++) {
-        permutation[i] = i;
-      }
-      RandomShuffle(permutation.begin(), permutation.end(), uniform);
-      for (int i = 0; i < size; i++) {
-        output_mat.template chip<0>(i) =
-            input_mat.template chip<0>(permutation[i]);
+      if (size < kint32max) {
+        IndexedShuffle<int32>(size, input_mat, output_mat, uniform);
+      } else {
+        IndexedShuffle<int64>(size, input_mat, output_mat, uniform);
       }
     }
   }
diff --git a/tensorflow/core/kernels/sparse_reduce_sum_op.cc b/tensorflow/core/kernels/sparse_reduce_sum_op.cc
index 02b64c48479..20233b120d2 100644
--- a/tensorflow/core/kernels/sparse_reduce_sum_op.cc
+++ b/tensorflow/core/kernels/sparse_reduce_sum_op.cc
@@ -74,6 +74,14 @@ class SparseReduceSumOp : public OpKernel {
     std::vector<int32> axes(num_reduction_axes);
     std::copy_n(reduction_axes_t->flat<int32>().data(), num_reduction_axes,
                 axes.begin());
+    for (int i = 0; i < num_reduction_axes; ++i) {
+      int32 axis = axes[i];
+      OP_REQUIRES(
+          ctx, axis >= -ndims && axis < ndims,
+          errors::InvalidArgument("Invalid reduction dimension ", axis,
+                                  ", for input with ", ndims, " dimensions."));
+      axes[i] = (axes[i] + ndims) % ndims;
+    }
     std::sort(axes.begin(), axes.end());
 
     std::vector<int64> group_by_dims;
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index c8f4f8d25b0..378733f59b8 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -430,7 +430,8 @@ Reduces `sp_input` along the dimensions given in `reduction_axes`. Unless
 with length 1.
 
 If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-with a single element is returned.
+with a single element is returned. Additionally, the axes can be negative;
+they are interpreted according to the indexing rules in Python.
 
 input_indices: 2-D. `N x R` matrix with the indices of non-empty values in a
   SparseTensor, possibly not in canonical ordering.
diff --git a/tensorflow/examples/skflow/boston.py b/tensorflow/examples/skflow/boston.py
index bf2066770c7..9d895bd8e38 100644
--- a/tensorflow/examples/skflow/boston.py
+++ b/tensorflow/examples/skflow/boston.py
@@ -1,4 +1,4 @@
-# Copyright 2015-present The Scikit Flow Authors. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
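
The axis validation added to SparseReduceSumOp above wraps negative axes the
same way Python indexing does. A small self-contained sketch of that
normalization rule (plain Python; the helper name is hypothetical, but the
bounds check and error text mirror the kernel's):

    def normalize_reduction_axes(axes, ndims):
      # Each axis must satisfy -ndims <= axis < ndims, as enforced by the
      # OP_REQUIRES above; it is then wrapped into [0, ndims).
      normalized = []
      for axis in axes:
        if not -ndims <= axis < ndims:
          raise ValueError("Invalid reduction dimension %d, for input with %d "
                           "dimensions." % (axis, ndims))
        normalized.append((axis + ndims) % ndims)
      return sorted(normalized)

    print(normalize_reduction_axes([1, -2], ndims=2))  # [0, 1]
    print(normalize_reduction_axes([-1], ndims=2))     # [1]
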
diff --git a/tensorflow/examples/skflow/iris.py b/tensorflow/examples/skflow/iris.py index c6c566b10fd..ea44428d541 100644 --- a/tensorflow/examples/skflow/iris.py +++ b/tensorflow/examples/skflow/iris.py @@ -1,4 +1,4 @@ -# Copyright 2015-present The Scikit Flow Authors. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tensorflow/examples/skflow/iris_custom_decay_dnn.py b/tensorflow/examples/skflow/iris_custom_decay_dnn.py index f9c172725d9..b8b1a1dd140 100644 --- a/tensorflow/examples/skflow/iris_custom_decay_dnn.py +++ b/tensorflow/examples/skflow/iris_custom_decay_dnn.py @@ -1,4 +1,4 @@ -# Copyright 2015-present The Scikit Flow Authors. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tensorflow/examples/skflow/mnist.py b/tensorflow/examples/skflow/mnist.py index 082ecb2f839..d1288a31e98 100644 --- a/tensorflow/examples/skflow/mnist.py +++ b/tensorflow/examples/skflow/mnist.py @@ -1,4 +1,4 @@ -# Copyright 2015-present The Scikit Flow Authors. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tensorflow/examples/skflow/resnet.py b/tensorflow/examples/skflow/resnet.py index f1f39568d46..03a5d5e5191 100644 --- a/tensorflow/examples/skflow/resnet.py +++ b/tensorflow/examples/skflow/resnet.py @@ -1,4 +1,4 @@ -# Copyright 2015-present The Scikit Flow Authors. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,147 +12,155 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -This example builds deep residual network for mnist data. +"""This example builds deep residual network for mnist data. + Reference Paper: http://arxiv.org/pdf/1512.03385.pdf Note that this is still a work-in-progress. Feel free to submit a PR to make this better. """ + from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os from collections import namedtuple from math import sqrt +import os from sklearn import metrics import tensorflow as tf -from tensorflow.examples.tutorials.mnist import input_data from tensorflow.contrib import learn +from tensorflow.examples.tutorials.mnist import input_data def res_net(x, y, activation=tf.nn.relu): - """Builds a residual network. Note that if the input tensor is 2D, it must be - square in order to be converted to a 4D tensor. + """Builds a residual network. - Borrowed structure from here: https://github.com/pkmital/tensorflow_tutorials/blob/master/10_residual_network.py + Note that if the input tensor is 2D, it must be square in order to be + converted to a 4D tensor. 
- Args: - x: Input of the network - y: Output of the network - activation: Activation function to apply after each convolution - """ + Borrowed structure from: + github.com/pkmital/tensorflow_tutorials/blob/master/10_residual_network.py - # Configurations for each bottleneck block - BottleneckBlock = namedtuple( - 'BottleneckBlock', ['num_layers', 'num_filters', 'bottleneck_size']) - blocks = [BottleneckBlock(3, 128, 32), - BottleneckBlock(3, 256, 64), - BottleneckBlock(3, 512, 128), - BottleneckBlock(3, 1024, 256)] + Args: + x: Input of the network + y: Output of the network + activation: Activation function to apply after each convolution - input_shape = x.get_shape().as_list() + Returns: + Predictions and loss tensors. + """ - # Reshape the input into the right shape if it's 2D tensor - if len(input_shape) == 2: - ndim = int(sqrt(input_shape[1])) - x = tf.reshape(x, [-1, ndim, ndim, 1]) + # Configurations for each bottleneck block. + BottleneckBlock = namedtuple( + 'BottleneckBlock', ['num_layers', 'num_filters', 'bottleneck_size']) + blocks = [BottleneckBlock(3, 128, 32), + BottleneckBlock(3, 256, 64), + BottleneckBlock(3, 512, 128), + BottleneckBlock(3, 1024, 256)] - # First convolution expands to 64 channels - with tf.variable_scope('conv_layer1'): - net = learn.ops.conv2d(x, 64, [7, 7], batch_norm=True, - activation=activation, bias=False) + input_shape = x.get_shape().as_list() - # Max pool - net = tf.nn.max_pool( - net, [1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME') + # Reshape the input into the right shape if it's 2D tensor + if len(input_shape) == 2: + ndim = int(sqrt(input_shape[1])) + x = tf.reshape(x, [-1, ndim, ndim, 1]) - # First chain of resnets - with tf.variable_scope('conv_layer2'): - net = learn.ops.conv2d(net, blocks[0].num_filters, - [1, 1], [1, 1, 1, 1], - padding='VALID', bias=True) + # First convolution expands to 64 channels + with tf.variable_scope('conv_layer1'): + net = learn.ops.conv2d(x, 64, [7, 7], batch_norm=True, + activation=activation, bias=False) - # Create each bottleneck building block for each layer - for block_i, block in enumerate(blocks): - for layer_i in range(block.num_layers): + # Max pool + net = tf.nn.max_pool( + net, [1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME') - name = 'block_%d/layer_%d' % (block_i, layer_i) + # First chain of resnets + with tf.variable_scope('conv_layer2'): + net = learn.ops.conv2d(net, blocks[0].num_filters, + [1, 1], [1, 1, 1, 1], + padding='VALID', bias=True) - # 1x1 convolution responsible for reducing dimension - with tf.variable_scope(name + '/conv_in'): - conv = learn.ops.conv2d(net, block.bottleneck_size, - [1, 1], [1, 1, 1, 1], - padding='VALID', - activation=activation, - batch_norm=True, - bias=False) + # Create each bottleneck building block for each layer + for block_i, block in enumerate(blocks): + for layer_i in range(block.num_layers): - with tf.variable_scope(name + '/conv_bottleneck'): - conv = learn.ops.conv2d(conv, block.bottleneck_size, - [3, 3], [1, 1, 1, 1], - padding='SAME', - activation=activation, - batch_norm=True, - bias=False) + name = 'block_%d/layer_%d' % (block_i, layer_i) - # 1x1 convolution responsible for restoring dimension - with tf.variable_scope(name + '/conv_out'): - conv = learn.ops.conv2d(conv, block.num_filters, - [1, 1], [1, 1, 1, 1], - padding='VALID', - activation=activation, - batch_norm=True, - bias=False) + # 1x1 convolution responsible for reducing dimension + with tf.variable_scope(name + '/conv_in'): + conv = learn.ops.conv2d(net, block.bottleneck_size, + 
[1, 1], [1, 1, 1, 1], + padding='VALID', + activation=activation, + batch_norm=True, + bias=False) - # shortcut connections that turn the network into its counterpart - # residual function (identity shortcut) - net = conv + net + with tf.variable_scope(name + '/conv_bottleneck'): + conv = learn.ops.conv2d(conv, block.bottleneck_size, + [3, 3], [1, 1, 1, 1], + padding='SAME', + activation=activation, + batch_norm=True, + bias=False) - try: - # upscale to the next block size - next_block = blocks[block_i + 1] - with tf.variable_scope('block_%d/conv_upscale' % block_i): - net = learn.ops.conv2d(net, next_block.num_filters, - [1, 1], [1, 1, 1, 1], - bias=False, - padding='SAME') - except IndexError: - pass + # 1x1 convolution responsible for restoring dimension + with tf.variable_scope(name + '/conv_out'): + conv = learn.ops.conv2d(conv, block.num_filters, + [1, 1], [1, 1, 1, 1], + padding='VALID', + activation=activation, + batch_norm=True, + bias=False) - net_shape = net.get_shape().as_list() - net = tf.nn.avg_pool(net, - ksize=[1, net_shape[1], net_shape[2], 1], - strides=[1, 1, 1, 1], padding='VALID') + # shortcut connections that turn the network into its counterpart + # residual function (identity shortcut) + net = conv + net - net_shape = net.get_shape().as_list() - net = tf.reshape(net, [-1, net_shape[1] * net_shape[2] * net_shape[3]]) + try: + # upscale to the next block size + next_block = blocks[block_i + 1] + with tf.variable_scope('block_%d/conv_upscale' % block_i): + net = learn.ops.conv2d(net, next_block.num_filters, + [1, 1], [1, 1, 1, 1], + bias=False, + padding='SAME') + except IndexError: + pass - return learn.models.logistic_regression(net, y) + net_shape = net.get_shape().as_list() + net = tf.nn.avg_pool(net, + ksize=[1, net_shape[1], net_shape[2], 1], + strides=[1, 1, 1, 1], padding='VALID') + + net_shape = net.get_shape().as_list() + net = tf.reshape(net, [-1, net_shape[1] * net_shape[2] * net_shape[3]]) + + return learn.models.logistic_regression(net, y) # Download and load MNIST data. mnist = input_data.read_data_sets('MNIST_data') # Restore model if graph is saved into a folder. -if os.path.exists("models/resnet/graph.pbtxt"): - classifier = learn.TensorFlowEstimator.restore("models/resnet/") +if os.path.exists('models/resnet/graph.pbtxt'): + classifier = learn.TensorFlowEstimator.restore('models/resnet/') else: - # Create a new resnet classifier. - classifier = learn.TensorFlowEstimator( - model_fn=res_net, n_classes=10, batch_size=100, steps=100, - learning_rate=0.001, continue_training=True) + # Create a new resnet classifier. + classifier = learn.TensorFlowEstimator( + model_fn=res_net, n_classes=10, batch_size=100, steps=100, + learning_rate=0.001, continue_training=True) while True: - # Train model and save summaries into logdir. - classifier.fit(mnist.train.images, mnist.train.labels, logdir="models/resnet/") + # Train model and save summaries into logdir. + classifier.fit( + mnist.train.images, mnist.train.labels, logdir='models/resnet/') - # Calculate accuracy. - score = metrics.accuracy_score( - mnist.test.labels, classifier.predict(mnist.test.images, batch_size=64)) - print('Accuracy: {0:f}'.format(score)) + # Calculate accuracy. + score = metrics.accuracy_score( + mnist.test.labels, classifier.predict(mnist.test.images, batch_size=64)) + print('Accuracy: {0:f}'.format(score)) - # Save model graph and checkpoints. - classifier.save("models/resnet/") + # Save model graph and checkpoints. 
+  classifier.save('models/resnet/')
diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/tf.parse_example.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/tf.parse_example.md
index 9a7476475ec..2f2f5111963 100644
--- a/tensorflow/g3doc/api_docs/python/functions_and_classes/tf.parse_example.md
+++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/tf.parse_example.md
@@ -74,7 +74,7 @@ example_names: ["input0", "input1"],
 features: {
     "kw": VarLenFeature(tf.string),
     "dank": VarLenFeature(tf.int64),
-    "gps": VarLenFeature(tf.float),
+    "gps": VarLenFeature(tf.float32),
 }
 ```
diff --git a/tensorflow/g3doc/api_docs/python/io_ops.md b/tensorflow/g3doc/api_docs/python/io_ops.md
index 127b461e4d2..61d01910524 100644
--- a/tensorflow/g3doc/api_docs/python/io_ops.md
+++ b/tensorflow/g3doc/api_docs/python/io_ops.md
@@ -1289,7 +1289,7 @@ example_names: ["input0", "input1"],
 features: {
     "kw": VarLenFeature(tf.string),
     "dank": VarLenFeature(tf.int64),
-    "gps": VarLenFeature(tf.float),
+    "gps": VarLenFeature(tf.float32),
 }
 ```
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 37178746933..9f28ad8b64e 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -1916,10 +1916,18 @@ class Graph(object):
 
   def __init__(self):
     """Creates a new, empty Graph."""
-    self._nodes_by_id = dict()
-    self._next_node_id = [dict()]
-    self._next_id_counter = 0
-    self._nodes_by_name = dict()
+    # Protects the core state that may be accessed by multiple readers.
+    # Only state that can be returned via public accessors (`as_graph_def()`,
+    # `get_operations()`, `as_graph_element()`, `get_collection()`, and
+    # `get_collection_ref()`) is protected by the lock. Thread-safety is
+    # provided on a best-effort basis to support buggy programs, and is not
+    # guaranteed by the public `tf.Graph` API.
+    # NOTE(mrry): This does not protect the various stacks. A warning will
+    # be reported if these are used from multiple threads.
+    self._lock = threading.Lock()
+    self._nodes_by_id = dict()  # GUARDED_BY(self._lock)
+    self._next_id_counter = 0  # GUARDED_BY(self._lock)
+    self._nodes_by_name = dict()  # GUARDED_BY(self._lock)
     # Current name stack: uniquified names
     self._name_stack = ""
     # Maps a name used in the graph to the next id to use for that name.
@@ -1987,15 +1995,15 @@ class Graph(object):
     self._check_not_finalized()
     if not isinstance(op, (Tensor, Operation)):
       raise TypeError("op must be a Tensor or Operation: %s" % op)
-
-    if op._id in self._nodes_by_id:
-      raise ValueError("cannot add an op with id %d as it already "
-                       "exists in the graph" % op._id)
-    if op.name in self._nodes_by_name:
-      raise ValueError("cannot add op with name %s as that name "
-                       "is already used" % op.name)
-    self._nodes_by_id[op._id] = op
-    self._nodes_by_name[op.name] = op
+    with self._lock:
+      if op._id in self._nodes_by_id:
+        raise ValueError("cannot add an op with id %d as it already "
+                         "exists in the graph" % op._id)
+      if op.name in self._nodes_by_name:
+        raise ValueError("cannot add op with name %s as that name "
+                         "is already used" % op.name)
+      self._nodes_by_id[op._id] = op
+      self._nodes_by_name[op.name] = op
 
   @property
   def version(self):
@@ -2081,31 +2089,32 @@ class Graph(object):
 
     Raises:
       ValueError: If the `graph_def` would be too large.
""" - graph = graph_pb2.GraphDef() - graph.versions.CopyFrom(self._graph_def_versions) - bytesize = 0 - for op_id in sorted(self._nodes_by_id): - op = self._nodes_by_id[op_id] - if from_version is None or op_id > from_version: - graph.node.extend([op.node_def]) - if op.outputs and add_shapes: - assert "_output_shapes" not in graph.node[-1].attr - graph.node[-1].attr["_output_shapes"].list.shape.extend([ - output.get_shape().as_proto() for output in op.outputs]) - bytesize += op.node_def.ByteSize() - if bytesize >= (1 << 31) or bytesize < 0: - raise ValueError("GraphDef cannot be larger than 2GB.") - if self._functions: - for f in self._functions.values(): - bytesize += f.ByteSize() - if bytesize >= (1 << 31) or bytesize < 0: - raise ValueError("GraphDef cannot be larger than 2GB.") - graph.library.function.extend(self._functions.values()) - for func in self._function_gradient: - grad_def = function_pb2.GradientDef() - grad_def.function_name = func - grad_def.gradient_func = self._function_gradient[func] - graph.library.gradient.extend([grad_def]) + with self._lock: + graph = graph_pb2.GraphDef() + graph.versions.CopyFrom(self._graph_def_versions) + bytesize = 0 + for op_id in sorted(self._nodes_by_id): + op = self._nodes_by_id[op_id] + if from_version is None or op_id > from_version: + graph.node.extend([op.node_def]) + if op.outputs and add_shapes: + assert "_output_shapes" not in graph.node[-1].attr + graph.node[-1].attr["_output_shapes"].list.shape.extend([ + output.get_shape().as_proto() for output in op.outputs]) + bytesize += op.node_def.ByteSize() + if bytesize >= (1 << 31) or bytesize < 0: + raise ValueError("GraphDef cannot be larger than 2GB.") + if self._functions: + for f in self._functions.values(): + bytesize += f.ByteSize() + if bytesize >= (1 << 31) or bytesize < 0: + raise ValueError("GraphDef cannot be larger than 2GB.") + graph.library.function.extend(self._functions.values()) + for func in self._function_gradient: + grad_def = function_pb2.GradientDef() + grad_def.function_name = func + grad_def.gradient_func = self._function_gradient[func] + graph.library.gradient.extend([grad_def]) return graph @@ -2298,7 +2307,11 @@ class Graph(object): example, an invalid string. KeyError: If `obj` is not an object in the graph. """ + with self._lock: + return self._as_graph_element_locked(obj, allow_tensor, allow_operation) + def _as_graph_element_locked(self, obj, allow_tensor, allow_operation): + """See `Graph.as_graph_element()` for details.""" # The vast majority of this function is figuring # out what an API user might be doing wrong, so # that we can give helpful error messages. @@ -2398,7 +2411,8 @@ class Graph(object): Returns: A list of Operations. """ - return list(self._nodes_by_id.values()) + with self._lock: + return list(self._nodes_by_id.values()) def get_operation_by_name(self, name): """Returns the `Operation` with the given `name`. @@ -2445,8 +2459,9 @@ class Graph(object): def _next_id(self): """Id for next Operation instance. Also increments the internal id.""" self._check_not_finalized() - self._next_id_counter += 1 - return self._next_id_counter + with self._lock: + self._next_id_counter += 1 + return self._next_id_counter @property def _last_id(self): @@ -2499,10 +2514,11 @@ class Graph(object): value: The value to add to the collection. 
""" self._check_not_finalized() - if name not in self._collections: - self._collections[name] = [value] - else: - self._collections[name].append(value) + with self._lock: + if name not in self._collections: + self._collections[name] = [value] + else: + self._collections[name].append(value) def add_to_collections(self, names, value): """Stores `value` in the collections given by `names`. @@ -2543,11 +2559,12 @@ class Graph(object): The list of values in the collection with the given `name`, or an empty list if no value has been added to that collection. """ - coll_list = self._collections.get(name, None) - if coll_list is None: - coll_list = [] - self._collections[name] = coll_list - return coll_list + with self._lock: + coll_list = self._collections.get(name, None) + if coll_list is None: + coll_list = [] + self._collections[name] = coll_list + return coll_list def get_collection(self, name, scope=None): """Returns a list of values in the collection with the given `name`. @@ -2571,22 +2588,24 @@ class Graph(object): list contains the values in the order under which they were collected. """ - coll_list = self._collections.get(name, None) - if coll_list is None: - return [] - if scope is None: - return list(coll_list) - else: - c = [] - regex = re.compile(scope) - for item in coll_list: - if hasattr(item, "name") and regex.match(item.name): - c.append(item) - return c + with self._lock: + coll_list = self._collections.get(name, None) + if coll_list is None: + return [] + if scope is None: + return list(coll_list) + else: + c = [] + regex = re.compile(scope) + for item in coll_list: + if hasattr(item, "name") and regex.match(item.name): + c.append(item) + return c def get_all_collection_keys(self): """Returns a list of collections used in this graph.""" - return [x for x in self._collections if isinstance(x, six.string_types)] + with self._lock: + return [x for x in self._collections if isinstance(x, six.string_types)] @contextlib.contextmanager def _original_op(self, op): diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py index 038799681cb..97452a791d0 100644 --- a/tensorflow/python/kernel_tests/concat_op_test.py +++ b/tensorflow/python/kernel_tests/concat_op_test.py @@ -412,6 +412,17 @@ class ConcatOpTest(tf.test.TestCase): self.assertEqual(n + 3, after - before) print("graph = ", [x.name for x in g.get_operations()]) + def testConcatLargeTensors(self): + # CPU-only test, because it fails on GPUs with <= 4GB memory. + with tf.device("/cpu:0"): + a = tf.ones([2**31 + 6], dtype=tf.int8) + b = tf.zeros([1024], dtype=tf.int8) + onezeros = tf.concat(0, [a, b]) + with self.test_session(use_gpu=False): + # TODO(dga): Add more depth to this test to validate correctness, + # not just non-crashingness, once other large tensor fixes have gone in. 
+      _ = onezeros.eval()
+
 
 class ConcatOffsetTest(tf.test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py
index 595f3f41204..6c817a5da80 100644
--- a/tensorflow/python/kernel_tests/matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/matmul_op_test.py
@@ -158,14 +158,14 @@ class MatMulTest(tf.test.TestCase):
 
   def testComplex64Random(self):
     for _ in range(10):
-      n, k, m = np.random.randint(1, 100, size=3)
+      n, k, m = np.random.randint(1, 10, size=3)  # Smaller range than float
       x = self._randMatrix(n, k, np.complex64)
       y = self._randMatrix(k, m, np.complex64)
       self._testCpuMatmul(x, y)
 
   def testComplex128Random(self):
     for _ in range(10):
-      n, k, m = np.random.randint(1, 100, size=3)
+      n, k, m = np.random.randint(1, 10, size=3)  # Smaller range than float
       x = self._randMatrix(n, k, np.complex128)
       y = self._randMatrix(k, m, np.complex128)
       self._testCpuMatmul(x, y)
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index 6b046883d4d..037d1f2c3eb 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -417,16 +417,27 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
 
 class SparseReduceSumTest(test_util.TensorFlowTestCase):
 
-  def _compare(self, sp_t, reduction_axes, keep_dims):
+  # [[1, ?, 1]
+  #  [?, 1, ?]]
+  # where ? is implicitly-zero.
+  ind = np.array([[0, 0], [0, 2], [1, 1]]).astype(np.int64)
+  vals = np.array([1, 1, 1]).astype(np.int32)
+  shape = np.array([2, 3]).astype(np.int64)
+
+  def _compare(self, sp_t, reduction_axes, ndims, keep_dims):
     densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
 
     np_ans = densified
     if reduction_axes is None:
       np_ans = np.sum(np_ans, keepdims=keep_dims)
     else:
-      if isinstance(reduction_axes, list):
-        reduction_axes = sorted(reduction_axes)  # loop below depends on sorted
+      if not isinstance(reduction_axes, list):  # Single scalar.
+        reduction_axes = [reduction_axes]
       reduction_axes = np.array(reduction_axes).astype(np.int32)
+      # Handles negative axes.
+      reduction_axes = (reduction_axes + ndims) % ndims
+      # Loop below depends on sorted.
+      reduction_axes.sort()
       for ra in reduction_axes.ravel()[::-1]:
         np_ans = np.sum(np_ans, axis=ra, keepdims=keep_dims)
 
@@ -436,25 +447,21 @@ class SparseReduceSumTest(test_util.TensorFlowTestCase):
 
         self.assertAllClose(np_ans, out)
 
-  def _compare_all(self, sp_t, reduction_axes):
-    self._compare(sp_t, reduction_axes, False)
-    self._compare(sp_t, reduction_axes, True)
+  def _compare_all(self, sp_t, reduction_axes, ndims):
+    self._compare(sp_t, reduction_axes, ndims, False)
+    self._compare(sp_t, reduction_axes, ndims, True)
 
   def testSimpleAndRandomInputs(self):
-    # [[1, ?, 1]
-    #  [?, 1, ?]]
-    # where ? is implictly-zero.
- ind = np.array([[0, 0], [0, 2], [1, 1]]).astype(np.int64) - vals = np.array([1, 1, 1]).astype(np.int32) - shape = np.array([2, 3]).astype(np.int64) - sp_t = ops.SparseTensor(ind, vals, shape) + sp_t = ops.SparseTensor(self.ind, self.vals, self.shape) with self.test_session(use_gpu=False): - self._compare_all(sp_t, None) - self._compare_all(sp_t, 0) - self._compare_all(sp_t, [1]) - self._compare_all(sp_t, [0, 1]) - self._compare_all(sp_t, [1, 0]) + self._compare_all(sp_t, None, ndims=2) + self._compare_all(sp_t, 0, ndims=2) + self._compare_all(sp_t, [1], ndims=2) + self._compare_all(sp_t, [0, 1], ndims=2) + self._compare_all(sp_t, [1, 0], ndims=2) + self._compare_all(sp_t, [-1], ndims=2) + self._compare_all(sp_t, [1, -2], ndims=2) np.random.seed(1618) test_dims = [(1618, 1, 11, 7, 1), (1,), (1, 1, 1)] @@ -462,11 +469,19 @@ class SparseReduceSumTest(test_util.TensorFlowTestCase): for dims in test_dims: sp_t, unused_nnz = _sparsify(np.random.randn(*dims)) # reduce all using None - self._compare_all(sp_t, None) + self._compare_all(sp_t, None, ndims=len(dims)) # reduce random axes from 1D to N-D for d in range(1, len(dims) + 1): axes = np.random.choice(len(dims), size=d, replace=False).tolist() - self._compare_all(sp_t, axes) + self._compare_all(sp_t, axes, ndims=len(dims)) + + def testInvalidAxes(self): + sp_t = ops.SparseTensor(self.ind, self.vals, self.shape) + with self.test_session(use_gpu=False): + with self.assertRaisesOpError("Invalid reduction dimension -3"): + sparse_ops.sparse_reduce_sum(sp_t, -3).eval() + with self.assertRaisesOpError("Invalid reduction dimension 2"): + sparse_ops.sparse_reduce_sum(sp_t, 2).eval() def testGradient(self): np.random.seed(8161) @@ -483,6 +498,12 @@ class SparseReduceSumTest(test_util.TensorFlowTestCase): reduced.eval().shape) self.assertLess(err, 1e-3) + # Tests for negative axes. + reduced = sparse_ops.sparse_reduce_sum(sp_t, -1) + err = tf.test.compute_gradient_error(sp_t.values, (nnz,), reduced, + reduced.eval().shape) + self.assertLess(err, 1e-3) + class SparseMathOpsTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py index fa6696cbbc0..9d3a135cf0c 100644 --- a/tensorflow/python/ops/parsing_ops.py +++ b/tensorflow/python/ops/parsing_ops.py @@ -225,7 +225,7 @@ def parse_example(serialized, features, name=None, example_names=None): features: { "kw": VarLenFeature(tf.string), "dank": VarLenFeature(tf.int64), - "gps": VarLenFeature(tf.float), + "gps": VarLenFeature(tf.float32), } ``` diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index 4df0e9c5d8e..fbce1103fcc 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -548,7 +548,8 @@ def sparse_reduce_sum(sp_input, reduction_axes=None, keep_dims=False): with length 1. If `reduction_axes` has no entries, all dimensions are reduced, and a tensor - with a single element is returned. + with a single element is returned. Additionally, the axes can be negative, + similar to the indexing rules in Python. For example: @@ -558,7 +559,7 @@ def sparse_reduce_sum(sp_input, reduction_axes=None, keep_dims=False): # where ? is implictly-zero. tf.sparse_reduce_sum(x) ==> 3 tf.sparse_reduce_sum(x, 0) ==> [1, 1, 1] - tf.sparse_reduce_sum(x, 1) ==> [2, 1] + tf.sparse_reduce_sum(x, 1) ==> [2, 1] # Can also use -1 as the axis. 
tf.sparse_reduce_sum(x, 1, keep_dims=True) ==> [[2], [1]] tf.sparse_reduce_sum(x, [0, 1]) ==> 3 ``` diff --git a/tensorflow/python/summary/event_accumulator.py b/tensorflow/python/summary/event_accumulator.py index 2ee8a369f76..204ed009129 100644 --- a/tensorflow/python/summary/event_accumulator.py +++ b/tensorflow/python/summary/event_accumulator.py @@ -114,8 +114,7 @@ class EventAccumulator(object): `Accumulator.Scalars(tag)`) allow for the retrieval of all data associated with that tag. - Before usage, the `EventAccumulator` must be activated via `Reload()`. This - method synchronosly loads all of the data written so far. + The `Reload()` method synchronously loads all of the data written so far. Histograms, audio, and images are very large, so storing all of them is not recommended. @@ -175,7 +174,6 @@ class EventAccumulator(object): self._compression_bps = compression_bps self.purge_orphaned_data = purge_orphaned_data - self._activated = False self.most_recent_step = -1 self.most_recent_wall_time = -1 self.file_version = None @@ -188,12 +186,10 @@ class EventAccumulator(object): """Loads all events added since the last call to `Reload`. If `Reload` was never called, loads all events in the file. - Calling `Reload` activates the `EventAccumulator`. Returns: The `EventAccumulator`. """ - self._activated = True with self._generator_mutex: for event in self._generator.Load(): if event.HasField('file_version'): @@ -232,13 +228,9 @@ class EventAccumulator(object): def Tags(self): """Return all tags found in the value stream. - Raises: - RuntimeError: If the `EventAccumulator` has not been activated. - Returns: A `{tagType: ['list', 'of', 'tags']}` dictionary. """ - self._VerifyActivated() return {IMAGES: self._images.Keys(), AUDIO: self._audio.Keys(), HISTOGRAMS: self._histograms.Keys(), @@ -255,12 +247,10 @@ class EventAccumulator(object): Raises: KeyError: If the tag is not found. - RuntimeError: If the `EventAccumulator` has not been activated. Returns: An array of `ScalarEvent`s. """ - self._VerifyActivated() return self._scalars.Items(tag) def Graph(self): @@ -268,12 +258,10 @@ class EventAccumulator(object): Raises: ValueError: If there is no graph for this run. - RuntimeError: If the `EventAccumulator` has not been activated. Returns: The `graph_def` proto. """ - self._VerifyActivated() if self._graph is None: raise ValueError('There is no graph in this EventAccumulator') graph = graph_pb2.GraphDef() @@ -288,12 +276,10 @@ class EventAccumulator(object): Raises: ValueError: If the tag is not found. - RuntimeError: If the `EventAccumulator` has not been activated. Returns: The metadata in form of `RunMetadata` proto. """ - self._VerifyActivated() if tag not in self._tagged_metadata: raise ValueError('There is no run metadata with this tag name') @@ -309,12 +295,10 @@ class EventAccumulator(object): Raises: KeyError: If the tag is not found. - RuntimeError: If the `EventAccumulator` has not been activated. Returns: An array of `HistogramEvent`s. """ - self._VerifyActivated() return self._histograms.Items(tag) def CompressedHistograms(self, tag): @@ -325,12 +309,10 @@ class EventAccumulator(object): Raises: KeyError: If the tag is not found. - RuntimeError: If the `EventAccumulator` has not been activated. Returns: An array of `CompressedHistogramEvent`s. """ - self._VerifyActivated() return self._compressed_histograms.Items(tag) def Images(self, tag): @@ -341,12 +323,10 @@ class EventAccumulator(object): Raises: KeyError: If the tag is not found. 
- RuntimeError: If the `EventAccumulator` has not been activated. Returns: An array of `ImageEvent`s. """ - self._VerifyActivated() return self._images.Items(tag) def Audio(self, tag): @@ -357,12 +337,10 @@ class EventAccumulator(object): Raises: KeyError: If the tag is not found. - RuntimeError: If the `EventAccumulator` has not been activated. Returns: An array of `AudioEvent`s. """ - self._VerifyActivated() return self._audio.Items(tag) def _MaybePurgeOrphanedData(self, event): @@ -599,10 +577,6 @@ class EventAccumulator(object): event.wall_time, *expired_per_type) logging.warn(purge_msg) - def _VerifyActivated(self): - if not self._activated: - raise RuntimeError('Accumulator must be activated before it may be used.') - def _GetPurgeMessage(most_recent_step, most_recent_wall_time, event_step, event_wall_time, num_expired_scalars, num_expired_histos, diff --git a/tensorflow/python/summary/event_accumulator_test.py b/tensorflow/python/summary/event_accumulator_test.py index f6b60b91db9..b154d853322 100644 --- a/tensorflow/python/summary/event_accumulator_test.py +++ b/tensorflow/python/summary/event_accumulator_test.py @@ -456,18 +456,6 @@ class MockingEventAccumulatorTest(EventAccumulatorTest): self.assertEqual(acc.Audio('snd1'), [snd1]) self.assertEqual(acc.Audio('snd2'), [snd2]) - def testActivation(self): - gen = _EventGenerator() - acc = ea.EventAccumulator(gen) - self.assertFalse(acc._activated) - with self.assertRaises(RuntimeError): - acc.Tags() - with self.assertRaises(RuntimeError): - acc.Scalars('s1') - acc.Reload() - self.assertTrue(acc._activated) - acc._activated = False - def testKeyError(self): gen = _EventGenerator() acc = ea.EventAccumulator(gen) diff --git a/tensorflow/python/summary/event_multiplexer.py b/tensorflow/python/summary/event_multiplexer.py index a0f4ef402f3..00eab3d215d 100644 --- a/tensorflow/python/summary/event_multiplexer.py +++ b/tensorflow/python/summary/event_multiplexer.py @@ -113,8 +113,7 @@ class EventMultiplexer(object): accumulator. If `Reload` has been called, it will `Reload` the newly created - accumulators. This maintains the invariant that once the Multiplexer was - activated, all of its accumulators are active. + accumulators. Args: path: Path to the event files (or event directory) for given run. @@ -199,7 +198,6 @@ class EventMultiplexer(object): Raises: KeyError: If the run is not found, or the tag is not available for the given run. - RuntimeError: If the run's `EventAccumulator` has not been activated. Returns: An array of `event_accumulator.ScalarEvents`. @@ -216,7 +214,6 @@ class EventMultiplexer(object): Raises: KeyError: If the run is not found. ValueError: If the run does not have an associated graph. - RuntimeError: If the run's EventAccumulator has not been activated. Returns: The `graph_def` protobuf data structure. @@ -234,7 +231,6 @@ class EventMultiplexer(object): Raises: KeyError: If the run is not found, or the tag is not available for the given run. - RuntimeError: If the run's EventAccumulator has not been activated. Returns: The metadata in the form of `RunMetadata` protobuf data structure. @@ -252,7 +248,6 @@ class EventMultiplexer(object): Raises: KeyError: If the run is not found, or the tag is not available for the given run. - RuntimeError: If the run's `EventAccumulator` has not been activated. Returns: An array of `event_accumulator.HistogramEvents`. @@ -270,7 +265,6 @@ class EventMultiplexer(object): Raises: KeyError: If the run is not found, or the tag is not available for the given run. 
diff --git a/tensorflow/python/summary/event_multiplexer.py b/tensorflow/python/summary/event_multiplexer.py
index a0f4ef402f3..00eab3d215d 100644
--- a/tensorflow/python/summary/event_multiplexer.py
+++ b/tensorflow/python/summary/event_multiplexer.py
@@ -113,8 +113,7 @@ class EventMultiplexer(object):
       accumulator.
 
     If `Reload` has been called, it will `Reload` the newly created
-    accumulators. This maintains the invariant that once the Multiplexer was
-    activated, all of its accumulators are active.
+    accumulators.
 
     Args:
       path: Path to the event files (or event directory) for given run.
@@ -199,7 +198,6 @@
     Raises:
       KeyError: If the run is not found, or the tag is not available for
         the given run.
-      RuntimeError: If the run's `EventAccumulator` has not been activated.
 
     Returns:
       An array of `event_accumulator.ScalarEvents`.
@@ -216,7 +214,6 @@
     Raises:
       KeyError: If the run is not found.
       ValueError: If the run does not have an associated graph.
-      RuntimeError: If the run's EventAccumulator has not been activated.
 
     Returns:
       The `graph_def` protobuf data structure.
@@ -234,7 +231,6 @@
     Raises:
       KeyError: If the run is not found, or the tag is not available for
         the given run.
-      RuntimeError: If the run's EventAccumulator has not been activated.
 
     Returns:
       The metadata in the form of `RunMetadata` protobuf data structure.
@@ -252,7 +248,6 @@
     Raises:
       KeyError: If the run is not found, or the tag is not available for
         the given run.
-      RuntimeError: If the run's `EventAccumulator` has not been activated.
 
     Returns:
       An array of `event_accumulator.HistogramEvents`.
@@ -270,7 +265,6 @@
     Raises:
       KeyError: If the run is not found, or the tag is not available for
         the given run.
-      RuntimeError: If the run's EventAccumulator has not been activated.
 
     Returns:
       An array of `event_accumulator.CompressedHistogramEvents`.
@@ -288,7 +282,6 @@
     Raises:
       KeyError: If the run is not found, or the tag is not available for
         the given run.
-      RuntimeError: If the run's `EventAccumulator` has not been activated.
 
     Returns:
       An array of `event_accumulator.ImageEvents`.
@@ -306,7 +299,6 @@
     Raises:
       KeyError: If the run is not found, or the tag is not available for
         the given run.
-      RuntimeError: If the run's `EventAccumulator` has not been activated.
 
     Returns:
       An array of `event_accumulator.AudioEvents`.
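The multiplexer side mirrors this: per-run accessors now raise only `KeyError` for unknown runs or tags. A minimal sketch, assuming a hypothetical run directory under `/tmp/logdir` and the multiplexer's `AddRun`/`Scalars(run, tag)` interface:

```python
from tensorflow.python.summary import event_multiplexer as em

multiplexer = em.EventMultiplexer()
multiplexer.AddRun('/tmp/logdir/run1', name='run1')  # hypothetical path
multiplexer.Reload()  # reloads every accumulator, existing and newly added

# Raises KeyError for an unknown run or tag -- but never RuntimeError,
# since there is no longer an "activated" state to check.
for event in multiplexer.Scalars('run1', 'loss'):
  print(event.step, event.value)
```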
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 8f2b3d1c7c2..15aeee645c6 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -184,6 +184,7 @@ bool IsCudnnR2() {
  __macro(cudnnSetStream)                          \
  __macro(cudnnActivationForward)                  \
  __macro(cudnnConvolutionForward)                 \
+  __macro(cudnnConvolutionBackwardBias)            \
  __macro(cudnnGetConvolutionForwardWorkspaceSize) \
  __macro(cudnnTransformTensor)                    \
  __macro(cudnnSetConvolutionNdDescriptor)         \
@@ -1493,6 +1494,72 @@ bool CudnnSupport::DoConvolveBackwardFilter(
                                      algorithm, output_profile_result);
}

+template <class T>
+bool CudnnSupport::DoConvolveBackwardBiasImpl(
+    Stream* stream, int cudnn_type,  // Actually cudnnDataType_t.
+    const dnn::BatchDescriptor& input_descriptor,
+    const DeviceMemory<T>& input_data,
+    const dnn::BatchDescriptor& bias_descriptor,
+    DeviceMemory<T>* backward_bias_data) {
+  mutex_lock lock{dnn_handle_mutex_};
+  auto status = dynload::cudnnSetStream(parent_, ToHandle(dnn_handle_),
+                                        AsCUDAStreamValue(stream));
+  if (status != CUDNN_STATUS_SUCCESS) {
+    LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
+  }
+
+  ScopedTensorDescriptor input_nd{parent_, input_descriptor,
+                                  static_cast<cudnnDataType_t>(cudnn_type)};
+  ScopedTensorDescriptor bias_nd{parent_, bias_descriptor,
+                                 static_cast<cudnnDataType_t>(cudnn_type)};
+
+  // Alpha is the scaling factor for input.
+  float alpha = 1.0;
+  // Beta is the scaling factor for output.
+  float beta = 0.0;
+
+  status = dynload::cudnnConvolutionBackwardBias(
+      parent_, ToHandle(dnn_handle_), &alpha, input_nd.handle(),
+      input_data.opaque(), &beta, bias_nd.handle(),
+      backward_bias_data->opaque());
+  if (status != CUDNN_STATUS_SUCCESS) {
+    LOG(FATAL) << "failed to enqueue backward convolution on stream: "
+               << ToString(status);
+    return false;
+  }
+  return true;
+}
+
+bool CudnnSupport::DoConvolveBackwardBias(
+    Stream* stream, const BatchDescriptor& input_descriptor,
+    const DeviceMemory<double>& input_data,
+    const BatchDescriptor& bias_descriptor,
+    DeviceMemory<double>* backward_bias_data) {
+  return DoConvolveBackwardBiasImpl(stream, CUDNN_DATA_DOUBLE, input_descriptor,
+                                    input_data, bias_descriptor,
+                                    backward_bias_data);
+}
+
+bool CudnnSupport::DoConvolveBackwardBias(
+    Stream* stream, const BatchDescriptor& input_descriptor,
+    const DeviceMemory<float>& input_data,
+    const BatchDescriptor& bias_descriptor,
+    DeviceMemory<float>* backward_bias_data) {
+  return DoConvolveBackwardBiasImpl(stream, CUDNN_DATA_FLOAT, input_descriptor,
+                                    input_data, bias_descriptor,
+                                    backward_bias_data);
+}
+
+bool CudnnSupport::DoConvolveBackwardBias(
+    Stream* stream, const BatchDescriptor& input_descriptor,
+    const DeviceMemory<Eigen::half>& input_data,
+    const BatchDescriptor& bias_descriptor,
+    DeviceMemory<Eigen::half>* backward_bias_data) {
+  return DoConvolveBackwardBiasImpl(stream, CUDNN_DATA_HALF, input_descriptor,
+                                    input_data, bias_descriptor,
+                                    backward_bias_data);
+}
+
bool CudnnSupport::DoMatMul(Stream* stream,
                            const DeviceMemory<float>& input_data,
                            const DeviceMemory<float>& weights,
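For readers unfamiliar with `cudnnConvolutionBackwardBias`: it reduces the incoming gradient over every dimension except the channel (feature-map) dimension, which is exactly the gradient of a broadcast bias add. A NumPy sketch of those semantics, assuming an NCHW gradient tensor and the `alpha`/`beta` scaling used in the implementation above (an illustration of the math, not the executed code path):

```python
import numpy as np

def conv_backward_bias(grad_output, alpha=1.0, beta=0.0, prior=None):
  """Reference semantics of a backward-bias convolution for NCHW data.

  grad_output: ndarray of shape [N, C, H, W], the backpropagated gradient.
  Returns an array of shape [1, C, 1, 1]: the input dimensions with batch
  and spatial dimensions collapsed to 1, matching the bias descriptor.
  """
  grad_bias = grad_output.sum(axis=(0, 2, 3), keepdims=True)
  if prior is None:
    prior = np.zeros_like(grad_bias)
  # alpha scales the new result; beta scales whatever was already in the
  # output buffer. The implementation above passes alpha=1, beta=0.
  return alpha * grad_bias + beta * prior

grad = np.random.randn(8, 16, 5, 5).astype(np.float32)
print(conv_backward_bias(grad).shape)  # (1, 16, 1, 1)
```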
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 9388969770d..e3c9175e019 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -140,6 +140,24 @@ class CudnnSupport : public dnn::DnnSupport {
                                ScratchAllocator* scratch_allocator,
                                dnn::AlgorithmType algorithm,
                                dnn::ProfileResult* output_profile_result) override;

+  bool DoConvolveBackwardBias(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<double>& input_data,
+      const dnn::BatchDescriptor& bias_descriptor,
+      DeviceMemory<double>* backward_bias_data) override;
+
+  bool DoConvolveBackwardBias(Stream* stream,
+                              const dnn::BatchDescriptor& input_descriptor,
+                              const DeviceMemory<float>& input_data,
+                              const dnn::BatchDescriptor& bias_descriptor,
+                              DeviceMemory<float>* backward_bias_data) override;
+
+  bool DoConvolveBackwardBias(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<Eigen::half>& input_data,
+      const dnn::BatchDescriptor& bias_descriptor,
+      DeviceMemory<Eigen::half>* backward_bias_data) override;
+
  bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data,
                const DeviceMemory<float>& weights,
                const dnn::BatchDescriptor& input_dimensions,
@@ -311,6 +329,14 @@ class CudnnSupport : public dnn::DnnSupport {
                              dnn::AlgorithmType algorithm,
                              dnn::ProfileResult* output_profile_result);

+  template <class T>
+  bool DoConvolveBackwardBiasImpl(Stream* stream,
+                                  int cudnn_type,  // Actually cudnnDataType_t.
+                                  const dnn::BatchDescriptor& input_descriptor,
+                                  const DeviceMemory<T>& input_data,
+                                  const dnn::BatchDescriptor& bias_descriptor,
+                                  DeviceMemory<T>* backward_bias_data);
+
  SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
};

diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 01c457c90c7..6eaadcadc20 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -849,6 +849,43 @@ class DnnSupport {
                                       ScratchAllocator* scratch_allocator,
                                       AlgorithmType algorithm,
                                       ProfileResult* output_profile_result) = 0;

+  // Enqueues a backward convolution (for bias) operation onto the stream.
+  //
+  // Arguments:
+  //  stream: borrowed pointer to the stream that the 'convolve' operation
+  //    should be enqueued onto.
+  //  input_descriptor: dimensions of the input layer.
+  //  input_data: un-owned device memory region which contains the
+  //    convolution input.
+  //  bias_descriptor: dimensions of the bias tensor. Should be the same as the
+  //    input dimensions, but with the spatial dimensions set to 1.
+  //  backward_bias_data: un-owned device memory region in which to place the
+  //    backprop of the bias.
+  virtual bool DoConvolveBackwardBias(Stream* stream,
+                                      const BatchDescriptor& input_descriptor,
+                                      const DeviceMemory<double>& input_data,
+                                      const BatchDescriptor& bias_descriptor,
+                                      DeviceMemory<double>* backward_bias_data) {
+    return false;
+  }
+
+  virtual bool DoConvolveBackwardBias(
+      Stream* stream, const BatchDescriptor& input_descriptor,
+      const DeviceMemory<float>& input_data,
+      const BatchDescriptor& bias_descriptor,
+      DeviceMemory<float>* backward_bias_data) {
+    return false;
+  }
+
+  virtual bool DoConvolveBackwardBias(
+      Stream* stream, const BatchDescriptor& input_descriptor,
+      const DeviceMemory<Eigen::half>& input_data,
+      const BatchDescriptor& bias_descriptor,
+      DeviceMemory<Eigen::half>* backward_bias_data) {
+    return false;
+  }
+
  // Fully connects the "nodes" (float values) in input_data with
  // shape input_dimensions to output_data with output_dimensions
  // using provided weights. This is equivalent to computing a matrix
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 57a10b84f3b..3d264989026 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -741,6 +741,57 @@ Stream &Stream::ThenConvolveBackwardFilter(
                                         /*scratch_allocator=*/nullptr);
}

+template <typename T>
+Stream &Stream::ThenConvolveBackwardBiasImpl(
+    const dnn::BatchDescriptor &input_descriptor,
+    const DeviceMemory<T> &input_data,
+    const dnn::BatchDescriptor &bias_descriptor,
+    DeviceMemory<T> *backward_bias_data) {
+  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data), PARAM(bias_descriptor),
+            PARAM(backward_bias_data));
+
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoConvolveBackwardBias(this, input_descriptor, input_data,
+                                             bias_descriptor,
+                                             backward_bias_data));
+    } else {
+      SetError();
+      LOG(WARNING)
+          << "attempting to perform DNN operation using StreamExecutor "
+             "without DNN support";
+    }
+  }
+  return *this;
+}
+
+Stream &Stream::ThenConvolveBackwardBias(
+    const dnn::BatchDescriptor &input_descriptor,
+    const DeviceMemory<double> &input_data,
+    const dnn::BatchDescriptor &bias_descriptor,
+    DeviceMemory<double> *backward_bias_data) {
+  return ThenConvolveBackwardBiasImpl(input_descriptor, input_data,
+                                      bias_descriptor, backward_bias_data);
+}
+
+Stream &Stream::ThenConvolveBackwardBias(
+    const dnn::BatchDescriptor &input_descriptor,
+    const DeviceMemory<float> &input_data,
+    const dnn::BatchDescriptor &bias_descriptor,
+    DeviceMemory<float> *backward_bias_data) {
+  return ThenConvolveBackwardBiasImpl(input_descriptor, input_data,
+                                      bias_descriptor, backward_bias_data);
+}
+
+Stream &Stream::ThenConvolveBackwardBias(
+    const dnn::BatchDescriptor &input_descriptor,
+    const DeviceMemory<Eigen::half> &input_data,
+    const dnn::BatchDescriptor &bias_descriptor,
+    DeviceMemory<Eigen::half> *backward_bias_data) {
+  return ThenConvolveBackwardBiasImpl(input_descriptor, input_data,
+                                      bias_descriptor, backward_bias_data);
+}
+
Stream &Stream::ThenMatMul(const DeviceMemory<float> &input_data,
                           const DeviceMemory<float> &weights,
                           const dnn::BatchDescriptor &input_dimensions,
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index f5583d62215..b14bf06cdc8 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -371,6 +371,22 @@ class Stream {
                                    ScratchAllocator *scratch_allocator,
                                    dnn::AlgorithmType algorithm,
                                    dnn::ProfileResult *output_profile_result);

+  Stream &ThenConvolveBackwardBias(const dnn::BatchDescriptor &input_descriptor,
+                                   const DeviceMemory<double> &input_data,
+                                   const dnn::BatchDescriptor &bias_descriptor,
+                                   DeviceMemory<double> *backward_bias_data);
+
+  Stream &ThenConvolveBackwardBias(const dnn::BatchDescriptor &input_descriptor,
+                                   const DeviceMemory<float> &input_data,
+                                   const dnn::BatchDescriptor &bias_descriptor,
+                                   DeviceMemory<float> *backward_bias_data);
+
+  Stream &ThenConvolveBackwardBias(
+      const dnn::BatchDescriptor &input_descriptor,
+      const DeviceMemory<Eigen::half> &input_data,
+      const dnn::BatchDescriptor &bias_descriptor,
+      DeviceMemory<Eigen::half> *backward_bias_data);
+
  Stream &ThenMatMul(const DeviceMemory<float> &input_data,
                     const DeviceMemory<float> &weights,
                     const dnn::BatchDescriptor &input_dimensions,
@@ -1439,6 +1455,14 @@ class Stream {
  // BlockHostUntilDone() is called.
  internal::TemporaryMemoryManager temporary_memory_manager_;

+  // Implementation of ThenConvolveBackwardBias that is shared by all types.
+  template <typename T>
+  Stream &ThenConvolveBackwardBiasImpl(
+      const dnn::BatchDescriptor &input_descriptor,
+      const DeviceMemory<T> &input_data,
+      const dnn::BatchDescriptor &bias_descriptor,
+      DeviceMemory<T> *backward_bias_data);
+
  SE_DISALLOW_COPY_AND_ASSIGN(Stream);
};
diff --git a/tensorflow/tensorboard/backend/server.py b/tensorflow/tensorboard/backend/server.py
index cfdd6c56543..b025a2f5b9f 100644
--- a/tensorflow/tensorboard/backend/server.py
+++ b/tensorflow/tensorboard/backend/server.py
@@ -120,12 +120,9 @@ def StartMultiplexerReloadingThread(multiplexer, path_to_run, load_interval):
 
   Returns:
     A started `threading.Thread` that reloads the multiplexer.
   """
-  # Ensure the Multiplexer initializes in a loaded state before it adds runs
-  # So it can handle HTTP requests while runs are loading
-  multiplexer.Reload()
-
+  # We don't call multiplexer.Reload() here because that would make
+  # AddRunsFromDirectory block until the runs have all loaded.
  for path in path_to_run.keys():
    if gcs.IsGCSPath(path):
      gcs.CheckIsSupported()
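The net effect is that the initial load moves entirely onto the reloader thread, so the HTTP server can come up immediately. A minimal sketch of the pattern (a hypothetical helper, not the exact TensorBoard code), assuming a `multiplexer` object with a `Reload()` method:

```python
import threading
import time

def start_reloading_thread(multiplexer, load_interval):
  """Hypothetical sketch: reload the multiplexer forever on a daemon thread."""
  def _reload_forever():
    while True:
      multiplexer.Reload()       # the first load now happens here, off the
      time.sleep(load_interval)  # request-serving path
  thread = threading.Thread(target=_reload_forever, name='Reloader')
  thread.daemon = True
  thread.start()
  return thread
```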
@@ -#include "eigen-eigen-a5e9085a94e8/Eigen/QR" +#include "eigen-eigen-f3a13643ac1f/Eigen/QR" diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor index a9b263f5ae3..8d363c3845f 100644 --- a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor +++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor @@ -1 +1 @@ -#include "eigen-eigen-a5e9085a94e8/unsupported/Eigen/CXX11/Tensor" +#include "eigen-eigen-f3a13643ac1f/unsupported/Eigen/CXX11/Tensor" diff --git a/third_party/gpus/cuda/BUILD b/third_party/gpus/cuda/BUILD index a0d1d6561b0..b68104385d6 100644 --- a/third_party/gpus/cuda/BUILD +++ b/third_party/gpus/cuda/BUILD @@ -31,6 +31,15 @@ config_setting( }, ) +# Equivalent to using_clang && -c opt. +config_setting( + name = "using_clang_opt", + values = { + "define": "using_cuda_clang=true", + "compilation_mode": "opt", + }, +) + config_setting( name = "darwin", values = {"cpu": "darwin"},