Merge changes from GitHub.

Change: 128401884

Commit 21716d8f6e (parent ed281973d6)
Author: Martin Wicke, 2016-07-25 13:48:16 -08:00; committed by TensorFlower Gardener
105 changed files with 2576 additions and 1144 deletions


@@ -18,7 +18,10 @@ If installed from binary pip package, provide:

 1. Which pip package you installed.
 2. The output from `python -c "import tensorflow; print(tensorflow.__version__)"`.

-If installed from sources, provide the commit hash:
+If installed from source, provide
+
+1. The commit hash (`git rev-parse HEAD`)
+2. The output of `bazel version`

 ### Steps to reproduce

 1.


@@ -1,9 +1,8 @@
 package(default_visibility = ["//visibility:public"])

-archive_dir = "eigen-eigen-b4fa9622b809"
-
 cc_library(
     name = "eigen",
-    hdrs = glob([archive_dir+"/**/*.h", archive_dir+"/unsupported/Eigen/*", archive_dir+"/unsupported/Eigen/CXX11/*", archive_dir+"/Eigen/*"]),
-    includes = [ archive_dir ],
+    hdrs = glob(["**/*.h", "unsupported/Eigen/*", "unsupported/Eigen/CXX11/*", "Eigen/*"]),
+    includes = [ '.' ],
     visibility = ["//visibility:public"],
 )

gif.BUILD (new file)

@@ -0,0 +1,23 @@
+SOURCES = [
+    "dgif_lib.c",
+    "egif_lib.c",
+    "gif_font.c",
+    "gif_hash.c",
+    "gifalloc.c",
+    "openbsd-reallocarray.c",
+    "gif_err.c",
+    "quantize.c",
+]
+
+prefix_dir = "giflib-5.1.4/lib"
+
+cc_library(
+    name = "gif",
+    srcs = [prefix_dir + "/" + source for source in SOURCES],
+    hdrs = [prefix_dir + "/gif_lib.h"],
+    includes = [prefix_dir],
+    defines = [
+        "HAVE_CONFIG_H",
+    ],
+    visibility = ["//visibility:public"],
+)


@@ -7,16 +7,30 @@

 include (ExternalProject)

-set(eigen_archive_hash "b4fa9622b809")
+# We parse the current Eigen version and archive hash from the bazel configuration
+file(STRINGS ${PROJECT_SOURCE_DIR}/../../workspace.bzl workspace_contents)
+foreach(line ${workspace_contents})
+  string(REGEX MATCH ".*eigen_version.*=.*\"(.*)\"" has_version ${line})
+  if(has_version)
+    set(eigen_version ${CMAKE_MATCH_1})
+    break()
+  endif()
+endforeach()
+foreach(line ${workspace_contents})
+  string(REGEX MATCH ".*eigen_sha256.*=.*\"(.*)\"" has_hash ${line})
+  if(has_hash)
+    set(eigen_hash ${CMAKE_MATCH_1})
+    break()
+  endif()
+endforeach()

 set(eigen_INCLUDE_DIRS
     ${CMAKE_CURRENT_BINARY_DIR}
     ${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive
-    ${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive/eigen-eigen-${eigen_archive_hash}
     ${tensorflow_source_dir}/third_party/eigen3
 )
-set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
-set(eigen_HASH SHA256=2862840c2de9c0473a4ef20f8678949ae89ab25965352ee53329e63ba46cec62)
+set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_version}.tar.gz)
+set(eigen_HASH SHA256=${eigen_hash})
 set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
 set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)

@@ -30,5 +44,5 @@ ExternalProject_Add(eigen
     -DCMAKE_BUILD_TYPE:STRING=Release
     -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
     -DCMAKE_INSTALL_PREFIX:STRING=${eigen_INSTALL}
-    -DINCLUDE_INSTALL_DIR:STRING=${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive/eigen-eigen-${eigen_archive_hash}
+    -DINCLUDE_INSTALL_DIR:STRING=${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive
 )
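The regexes above read two assignments out of `tensorflow/workspace.bzl`; the Makefile and `download_dependencies.sh` changes later in this commit parse the same lines. A minimal sketch of what those assignments look like, with placeholders standing in for the real commit id and checksum:

```python
# Hypothetical excerpt of tensorflow/workspace.bzl (placeholder values only):
eigen_version = "<eigen-commit-id>"         # matched by .*eigen_version.*=.*"(.*)"
eigen_sha256 = "<sha256-of-eigen-tarball>"  # matched by .*eigen_sha256.*=.*"(.*)"
```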


@@ -55,12 +55,8 @@ class KMeansClustering(estimator.Estimator,
                distance_metric=clustering_ops.SQUARED_EUCLIDEAN_DISTANCE,
                random_seed=0,
                use_mini_batch=True,
-               batch_size=128,
-               steps=10,
                kmeans_plus_plus_num_retries=2,
-               continue_training=False,
-               config=None,
-               verbose=1):
+               config=None):
     """Creates a model for running KMeans training and inference.

     Args:
@@ -73,25 +69,17 @@ class KMeansClustering(estimator.Estimator,
       random_seed: Python integer. Seed for PRNG used to initialize centers.
       use_mini_batch: If true, use the mini-batch k-means algorithm. Else assume
         full batch.
-      batch_size: See TensorFlowEstimator
-      steps: See TensorFlowEstimator
       kmeans_plus_plus_num_retries: For each point that is sampled during
         kmeans++ initialization, this parameter specifies the number of
         additional points to draw from the current distribution before selecting
         the best. If a negative value is specified, a heuristic is used to
         sample O(log(num_to_sample)) additional points.
-      continue_training: See TensorFlowEstimator
-      config: See TensorFlowEstimator
-      verbose: See TensorFlowEstimator
+      config: See Estimator
     """
     super(KMeansClustering, self).__init__(
         model_dir=model_dir,
         config=config)
-    self.batch_size = batch_size
-    self.steps = steps
     self.kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
-    self.continue_training = continue_training
-    self.verbose = verbose
     self._num_clusters = num_clusters
     self._training_initial_clusters = initial_clusters
     self._training_graph = None
@@ -135,11 +123,11 @@ class KMeansClustering(estimator.Estimator,
       return relative_change < self._tolerance
   # pylint: enable=protected-access

-  def fit(self, x, y=None, monitors=None, logdir=None, steps=None,
+  def fit(self, x, y=None, monitors=None, logdir=None, steps=None, batch_size=128,
           relative_tolerance=None):
     """Trains a k-means clustering on x.

-    Note: See TensorFlowEstimator for logic for continuous training and graph
+    Note: See Estimator for logic for continuous training and graph
       construction across multiple calls to fit.

     Args:
@@ -151,6 +139,7 @@ class KMeansClustering(estimator.Estimator,
         visualization.
       steps: number of training steps. If not None, overrides the value passed
         in constructor.
+      batch_size: mini-batch size to use. Requires `use_mini_batch=True`.
       relative_tolerance: A relative tolerance of change in the loss between
         iterations. Stops learning if the loss changes less than this amount.
         Note that this may not work correctly if use_mini_batch=True.
@@ -162,7 +151,7 @@ class KMeansClustering(estimator.Estimator,
     if logdir is not None:
       self._model_dir = logdir
     self._data_feeder = data_feeder.setup_train_data_feeder(
-        x, None, self._num_clusters, self.batch_size)
+        x, None, self._num_clusters, batch_size if self._use_mini_batch else None)
     if relative_tolerance is not None:
       if monitors is not None:
         monitors += [self._StopWhenConverged(relative_tolerance)]
@@ -173,7 +162,7 @@ class KMeansClustering(estimator.Estimator,
           or (self.steps is not None))
     self._train_model(input_fn=self._data_feeder.input_builder,
                       feed_fn=self._data_feeder.get_feed_dict_fn(),
-                      steps=steps or self.steps,
+                      steps=steps,
                       monitors=monitors,
                       init_feed_fn=self._data_feeder.get_feed_dict_fn())
     return self
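The net effect on callers: `batch_size`, `steps`, `continue_training`, and `verbose` leave the constructor, and the mini-batch size is chosen per `fit()` call. A minimal usage sketch under that reading of the diff; the module aliases mirror the test file below, and the import paths are assumed rather than taken from this commit:

```python
# Sketch only: import paths are assumptions, not confirmed by this commit.
import numpy as np
from tensorflow.contrib.factorization.python.ops import clustering_ops as kmeans_ops
from tensorflow.contrib.learn.python.learn.estimators.kmeans import KMeansClustering as KMeans

points = np.random.rand(1000, 2).astype(np.float32)

kmeans = KMeans(num_clusters=3,
                initial_clusters=kmeans_ops.RANDOM_INIT,
                use_mini_batch=True)          # batch_size/steps no longer set here
kmeans.fit(x=points, steps=10, batch_size=8)  # they are per-call arguments now
assignments = kmeans.predict(points, batch_size=8)
```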


@@ -53,13 +53,14 @@ class KMeansTest(tf.test.TestCase):
     self.kmeans = KMeans(self.num_centers,
                          initial_clusters=kmeans_ops.RANDOM_INIT,
-                         batch_size=self.batch_size,
                          use_mini_batch=self.use_mini_batch,
-                         steps=30,
-                         continue_training=True,
-                         config=run_config.RunConfig(tf_random_seed=14),
+                         config=self.config(14),
                          random_seed=12)

+  @staticmethod
+  def config(tf_random_seed):
+    return run_config.RunConfig(tf_random_seed=tf_random_seed)
+
   @property
   def batch_size(self):
     return self.num_points
@@ -86,7 +87,7 @@ class KMeansTest(tf.test.TestCase):
   def test_clusters(self):
     kmeans = self.kmeans
-    kmeans.fit(x=self.points, steps=0)
+    kmeans.fit(x=self.points, steps=1, batch_size=8)
     clusters = kmeans.clusters()
     self.assertAllEqual(list(clusters.shape),
                         [self.num_centers, self.num_dims])
@@ -97,10 +98,11 @@ class KMeansTest(tf.test.TestCase):
       return
     kmeans = self.kmeans
     kmeans.fit(x=self.points,
-               steps=1)
+               steps=1, batch_size=self.batch_size)
     score1 = kmeans.score(x=self.points)
     kmeans.fit(x=self.points,
-               steps=15 * self.num_points // self.batch_size)
+               steps=15 * self.num_points // self.batch_size,
+               batch_size=self.batch_size)
     score2 = kmeans.score(x=self.points)
     self.assertTrue(score1 > score2)
     self.assertNear(self.true_score, score2, self.true_score * 0.05)
@@ -111,39 +113,36 @@ class KMeansTest(tf.test.TestCase):
       return
     kmeans = KMeans(self.num_centers,
                     initial_clusters=kmeans_ops.RANDOM_INIT,
-                    batch_size=self.batch_size,
                     use_mini_batch=self.use_mini_batch,
-                    # Force it to train forever until the monitor stops it.
-                    steps=None,
-                    continue_training=True,
                     config=run_config.RunConfig(tf_random_seed=14),
                     random_seed=12)
     kmeans.fit(x=self.points,
                # Force it to train forever until the monitor stops it.
                steps=None,
+               batch_size=self.batch_size,
                relative_tolerance=1e-4)
     score = kmeans.score(x=self.points)
     self.assertNear(self.true_score, score, self.true_score * 0.005)

   def test_infer(self):
     kmeans = self.kmeans
-    kmeans.fit(x=self.points)
+    kmeans.fit(x=self.points, steps=10, batch_size=128)
     clusters = kmeans.clusters()

     # Make a small test set
     points, true_assignments, true_offsets = self.make_random_points(clusters,
                                                                      10)
     # Test predict
-    assignments = kmeans.predict(points)
+    assignments = kmeans.predict(points, batch_size=self.batch_size)
     self.assertAllEqual(assignments, true_assignments)

     # Test score
-    score = kmeans.score(points)
+    score = kmeans.score(points, batch_size=128)
     self.assertNear(score, np.sum(true_offsets), 0.01 * score)

     # Test transform
-    transform = kmeans.transform(points)
+    transform = kmeans.transform(points, batch_size=128)
     true_transform = np.maximum(
         0,
         np.sum(np.square(points), axis=1, keepdims=True) -
@@ -161,12 +160,9 @@ class KMeansTest(tf.test.TestCase):
                     initial_clusters=kmeans_ops.RANDOM_INIT,
                     distance_metric=kmeans_ops.COSINE_DISTANCE,
                     use_mini_batch=self.use_mini_batch,
-                    batch_size=4,
-                    steps=30,
-                    continue_training=True,
-                    config=run_config.RunConfig(tf_random_seed=2),
+                    config=self.config(2),
                     random_seed=12)
-    kmeans.fit(x=points)
+    kmeans.fit(x=points, steps=10, batch_size=4)
     centers = normalize(kmeans.clusters())
     self.assertAllClose(np.sort(centers, axis=0),
                         np.sort(true_centers, axis=0))
@@ -184,10 +180,8 @@ class KMeansTest(tf.test.TestCase):
                     initial_clusters=kmeans_ops.RANDOM_INIT,
                     distance_metric=kmeans_ops.COSINE_DISTANCE,
                     use_mini_batch=self.use_mini_batch,
-                    batch_size=8,
-                    continue_training=True,
-                    config=run_config.RunConfig(tf_random_seed=3))
-    kmeans.fit(x=points, steps=30)
+                    config=self.config(3))
+    kmeans.fit(x=points, steps=30, batch_size=8)
     centers = normalize(kmeans.clusters())

     self.assertAllClose(np.sort(centers, axis=0),
@@ -195,7 +189,7 @@ class KMeansTest(tf.test.TestCase):
                         atol=1e-2)

     true_transform = 1 - cosine_similarity(points, centers)
-    transform = kmeans.transform(points)
+    transform = kmeans.transform(points, batch_size=8)
     self.assertAllClose(transform, true_transform, atol=1e-3)

   def test_predict_with_cosine_distance(self):
@@ -217,20 +211,18 @@ class KMeansTest(tf.test.TestCase):
                     initial_clusters=kmeans_ops.RANDOM_INIT,
                     distance_metric=kmeans_ops.COSINE_DISTANCE,
                     use_mini_batch=self.use_mini_batch,
-                    batch_size=8,
-                    continue_training=True,
-                    config=run_config.RunConfig(tf_random_seed=3))
-    kmeans.fit(x=points, steps=30)
+                    config=self.config(3))
+    kmeans.fit(x=points, steps=30, batch_size=8)
     centers = normalize(kmeans.clusters())

     self.assertAllClose(np.sort(centers, axis=0),
                         np.sort(true_centers, axis=0), atol=1e-2)

-    assignments = kmeans.predict(points)
+    assignments = kmeans.predict(points, batch_size=8)
     self.assertAllClose(centers[assignments],
                         true_centers[true_assignments], atol=1e-2)

-    score = kmeans.score(points)
+    score = kmeans.score(points, batch_size=8)
     self.assertAllClose(score, true_score, atol=1e-2)

   def test_predict_with_cosine_distance_and_kmeans_plus_plus(self):
@@ -254,21 +246,19 @@ class KMeansTest(tf.test.TestCase):
                     initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT,
                     distance_metric=kmeans_ops.COSINE_DISTANCE,
                     use_mini_batch=self.use_mini_batch,
-                    batch_size=12,
-                    continue_training=True,
-                    config=run_config.RunConfig(tf_random_seed=3))
-    kmeans.fit(x=points, steps=30)
+                    config=self.config(3))
+    kmeans.fit(x=points, steps=30, batch_size=12)
     centers = normalize(kmeans.clusters())

     self.assertAllClose(sorted(centers.tolist()),
                         sorted(true_centers.tolist()),
                         atol=1e-2)

-    assignments = kmeans.predict(points)
+    assignments = kmeans.predict(points, batch_size=12)
     self.assertAllClose(centers[assignments],
                         true_centers[true_assignments], atol=1e-2)

-    score = kmeans.score(points)
+    score = kmeans.score(points, batch_size=12)
     self.assertAllClose(score, true_score, atol=1e-2)

   def test_fit_raise_if_num_clusters_larger_than_num_points_random_init(self):
@@ -276,7 +266,7 @@ class KMeansTest(tf.test.TestCase):
     with self.assertRaisesOpError('less'):
       kmeans = KMeans(num_clusters=3, initial_clusters=kmeans_ops.RANDOM_INIT)
-      kmeans.fit(x=points)
+      kmeans.fit(x=points, steps=10, batch_size=8)

   def test_fit_raise_if_num_clusters_larger_than_num_points_kmeans_plus_plus(
       self):
@@ -285,7 +275,7 @@ class KMeansTest(tf.test.TestCase):
     with self.assertRaisesOpError(AssertionError):
       kmeans = KMeans(num_clusters=3,
                       initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT)
-      kmeans.fit(x=points)
+      kmeans.fit(x=points, steps=10, batch_size=8)

 class MiniBatchKMeansTest(KMeansTest):


@@ -72,5 +72,14 @@ rundown:
    unused because no other code references the variables, but in fact their
    constructors have the important side effect of registering the class.

+ - C++11 support (or later) should be enabled by setting `C++ Language Dialect` to
+   `GNU++11` (or `GNU++14`), and `C++ Standard Library` to `libc++`.
+
  - The library doesn't currently support bitcode, so you'll need to disable that
    in your project settings.
+
+ - Remove any use of the `-all_load` flag in your project. The protocol buffers
+   libraries (full and lite versions) contain duplicate symbols, and the `-all_load`
+   flag will cause these duplicates to become link errors. If you were using
+   `-all_load` to avoid issues with Objective-C categories in static libraries,
+   you may be able to replace it with the `-ObjC` flag.


@@ -47,7 +47,9 @@ class Classifier(estimator.Estimator):

     Args:
       model_fn: (targets, predictions, mode) -> logits, loss, train_op
       n_classes: Number of classes
-      model_dir: Base directory for output data
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
       config: Configuration object (optional)
     """
     self._n_classes = n_classes


@@ -119,7 +119,9 @@ class DNNClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
       feature_columns: An iterable containing all the feature columns used by
         the model. All items in the set should be instances of classes derived
         from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
       n_classes: number of target classes. Default is binary classification.
         It must be greater than 1.
       weight_column_name: A string defining feature column name representing
@@ -277,7 +279,9 @@ class DNNRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
       feature_columns: An iterable containing all the feature columns used by
         the model. All items in the set should be instances of classes derived
         from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training. It
         will be multiplied by the loss of the example.


@@ -72,7 +72,9 @@ class _DNNLinearCombinedBaseEstimator(estimator.BaseEstimator):

     Args:
       target_column: A _TargetColumn object.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
       linear_feature_columns: An iterable containing all the feature columns
         used by linear part of the model. All items in the set should be
         instances of classes derived from `FeatureColumn`.
@@ -354,7 +356,9 @@ class DNNLinearCombinedClassifier(_DNNLinearCombinedBaseEstimator):
     """Constructs a DNNLinearCombinedClassifier instance.

     Args:
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
       n_classes: number of target classes. Default is binary classification.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training.
@@ -537,7 +541,9 @@ class DNNLinearCombinedRegressor(_DNNLinearCombinedBaseEstimator):
     """Initializes a DNNLinearCombinedRegressor instance.

     Args:
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training. It
         will be multiplied by the loss of the example.


@@ -158,7 +158,9 @@ class BaseEstimator(sklearn.BaseEstimator):
     """Initializes a BaseEstimator instance.

     Args:
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
       config: A RunConfig instance.
     """
     # Model directory.
@@ -766,7 +768,9 @@ class Estimator(BaseEstimator):
         is passed to Estimator in `params` parameter. This allows
         to configure Estimators from hyper parameter tuning.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
       config: Configuration object.
       params: `dict` of hyper parameters that will be passed into `model_fn`.
         Keys are names of parameters, values are basic python types.


@@ -122,7 +122,9 @@ class LinearClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
       feature_columns: An iterable containing all the feature columns used by
         the model. All items in the set should be instances of classes derived
         from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
       n_classes: number of target classes. Default is binary classification.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training. It
@@ -280,7 +282,9 @@ class LinearRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
       feature_columns: An iterable containing all the feature columns used by
         the model. All items in the set should be instances of classes derived
         from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph, etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training. It
         will be multiplied by the loss of the example.


@@ -57,7 +57,9 @@ class LogisticRegressor(estimator.Estimator):
         expects the returned predictions to be probabilities in [0.0, 1.0].
       thresholds: List of floating point thresholds to use for accuracy,
         precision, and recall metrics. If None, defaults to [0.5].
-      model_dir: Directory to save model parameters, graphs, etc.
+      model_dir: Directory to save model parameters, graphs, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
       config: A RunConfig configuration object.
     """
     if thresholds is None:


@@ -69,8 +69,7 @@ class TensorForestEstimator(estimator.BaseEstimator):
   def __init__(self, params, device_assigner=None, model_dir=None,
                graph_builder_class=tensor_forest.RandomForestGraphs,
                master='', accuracy_metric=None,
-               tf_random_seed=None, continue_training=False, verbose=1,
-               max_to_keep=5, save_checkpoint_secs=300):
+               tf_random_seed=None, config=None):
     self.params = params.fill()
     self.accuracy_metric = (accuracy_metric or
                             ('r2' if self.params.regression else 'accuracy'))
@@ -81,12 +80,6 @@ class TensorForestEstimator(estimator.BaseEstimator):
     self.training_args = {}
     self.construction_args = {}

-    config = run_config.RunConfig(
-        master=master,
-        tf_random_seed=(tf_random_seed or int((time.time() * 1000) % 1000)),
-        save_checkpoints_secs=save_checkpoint_secs,
-        keep_checkpoint_max=max_to_keep)
-
     super(TensorForestEstimator, self).__init__(model_dir=model_dir,
                                                 config=config)
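Since the estimator no longer builds a `RunConfig` internally, callers that relied on the removed keyword arguments now construct one themselves. A minimal migration sketch reusing the parameter names from the deleted code; the `run_config` import path and the `params` object (a pre-built forest hyperparameter struct) are assumptions, not taken from this commit:

```python
# Sketch only: the run_config import path is assumed from the 2016 contrib tree.
from tensorflow.contrib.learn.python.learn.estimators import run_config

# Previously assembled inside TensorForestEstimator.__init__; now caller-supplied.
config = run_config.RunConfig(
    master='',
    tf_random_seed=42,
    save_checkpoints_secs=300,  # was save_checkpoint_secs=300
    keep_checkpoint_max=5)      # was max_to_keep=5

estimator = TensorForestEstimator(params, model_dir='/tmp/forest', config=config)
```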


@@ -74,7 +74,9 @@ class SVM(linear.LinearClassifier):
       weight_column_name: A string defining feature column name representing
         weights. It is used to down weight or boost examples during training. It
         will be multiplied by the loss of the example.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
       l1_regularization: L1-regularization parameter
       l2_regularization: L2-regularization parameter
       kernels: A list of kernels for the SVM. Currently, no kernels are supported.


@@ -38,29 +38,29 @@ HOST_OBJDIR := $(MAKEFILE_DIR)/gen/host_obj/
 HOST_BINDIR := $(MAKEFILE_DIR)/gen/host_bin/
 HOST_GENDIR := $(MAKEFILE_DIR)/gen/host_obj/

-# Find the current Eigen version name from the Bazel build file
-EIGEN_HASH := $(shell cat eigen.BUILD | grep archive_dir | head -1 | cut -f3 -d- | cut -f1 -d\")
+# Find the current Eigen version from the Bazel configuration
+EIGEN_VERSION := $(shell grep eigen_version tensorflow/workspace.bzl | head -1 | sed -e 's/.*eigen_version.*=.*"\(.*\)"/\1/')

 # Settings for the host compiler.
 HOST_CXX := $(CC_PREFIX) gcc
 HOST_CXXFLAGS := --std=c++11
-HOST_LDOPTS := \
--L/usr/local/lib
+HOST_LDOPTS :=
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	HOST_LDOPTS += -L$(MAKEFILE_DIR)/gen/protobuf-host/lib
 endif
+HOST_LDOPTS += -L/usr/local/lib

 HOST_INCLUDES := \
--I/usr/local/include \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
--I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_HASH) \
+-I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_VERSION) \
 -I$(HOST_GENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include
 endif
+# This is at the end so any globally-installed frameworks like protobuf don't
+# override local versions in the source tree.
+HOST_INCLUDES += -I/usr/local/include

 HOST_LIBS := \
 -lstdc++ \
@@ -120,21 +120,18 @@ CXXFLAGS := --std=c++11 -DIS_SLIM_BUILD $(OPTFLAGS)
 LDFLAGS := \
 -L/usr/local/lib

-ifeq ($(HAS_GEN_HOST_PROTOC),true)
-	HOST_LDOPTS += -L$(MAKEFILE_DIR)/gen/protobuf-host/lib
-endif
-
 INCLUDES := \
--I/usr/local/include \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
--I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_HASH) \
+-I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_VERSION) \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include
 endif
+# This is at the end so any globally-installed frameworks like protobuf don't
+# override local versions in the source tree.
+INCLUDES += -I/usr/local/include

 LIBS := \
 -lstdc++ \
@@ -211,7 +208,7 @@ ifeq ($(TARGET),ANDROID)
 -I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi/include \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
--I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_HASH) \
+-I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_VERSION) \
 -I$(MAKEFILE_DIR)/gen/protobuf/include \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
@@ -364,7 +361,52 @@ BENCHMARK_NAME := $(BINDIR)benchmark

 # What sources we want to compile, derived from the main Bazel build using the
 # gen_file_lists.sh script.
-TF_CC_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_cc_files.txt)
+CORE_CC_ALL_SRCS := \
+$(wildcard tensorflow/core/*.cc) \
+$(wildcard tensorflow/core/common_runtime/*.cc) \
+$(wildcard tensorflow/core/debug/*.cc) \
+$(wildcard tensorflow/core/framework/*.cc) \
+$(wildcard tensorflow/core/graph/*.cc) \
+$(wildcard tensorflow/core/lib/*/*.cc) \
+$(wildcard tensorflow/core/platform/*.cc) \
+$(wildcard tensorflow/core/platform/*/*.cc) \
+$(wildcard tensorflow/core/util/*.cc) \
+$(wildcard tensorflow/core/util/*/*.cc)
+
+CORE_CC_EXCLUDE_SRCS := \
+$(wildcard tensorflow/core/*/*test.cc) \
+$(wildcard tensorflow/core/*/*testutil*) \
+$(wildcard tensorflow/core/*/*testlib*) \
+$(wildcard tensorflow/core/*/*main.cc) \
+$(wildcard tensorflow/core/*/*/*test.cc) \
+$(wildcard tensorflow/core/*/*/*testutil*) \
+$(wildcard tensorflow/core/*/*/*testlib*) \
+$(wildcard tensorflow/core/*/*/*main.cc) \
+$(wildcard tensorflow/core/graph/dot.*) \
+$(wildcard tensorflow/core/lib/gif/*) \
+$(wildcard tensorflow/core/lib/jpeg/*) \
+$(wildcard tensorflow/core/lib/png/*) \
+$(wildcard tensorflow/core/util/checkpoint_reader.*) \
+$(wildcard tensorflow/core/util/events_writer.*) \
+$(wildcard tensorflow/core/util/reporter.*) \
+$(wildcard tensorflow/core/util/tf_status_helper.*) \
+$(wildcard tensorflow/core/platform/default/stream_executor.*) \
+$(wildcard tensorflow/core/platform/default/test_benchmark.*) \
+$(wildcard tensorflow/core/platform/cuda.h) \
+$(wildcard tensorflow/core/platform/cloud/*) \
+$(wildcard tensorflow/core/platform/google/*) \
+$(wildcard tensorflow/core/platform/jpeg.*) \
+$(wildcard tensorflow/core/platform/png.*) \
+$(wildcard tensorflow/core/platform/stream_executor.*) \
+$(wildcard tensorflow/core/user_ops/*.cu.cc) \
+$(wildcard tensorflow/core/common_runtime/gpu/*) \
+$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*)
+
+# Filter out all the excluded files.
+TF_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
+# Add in any extra files that don't fit the patterns easily
+TF_CC_SRCS += tensorflow/core/common_runtime/gpu/gpu_tracer.cc
+# Also include the op and kernel definitions.
+TF_CC_SRCS += $(shell cat $(MAKEFILE_DIR)/tf_op_files.txt)
 PBT_CC_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_pb_text_files.txt)
 PROTO_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_proto_files.txt)
 BENCHMARK_SRCS := \


@@ -16,15 +16,15 @@ This static library will not contain:

 - Python or other language bindings
 - GPU support

 You can target:

 - iOS
 - OS X (macOS)
 - Android
 - Raspberry-PI

 You will compile tensorflow and protobuf libraries that you can link into other
 applications. You will also compile the [benchmark](../../tools/benchmark/)
 application that will let you check your application.

 ## Before you start (all platforms)
@@ -176,15 +176,16 @@ curl -o ~/graphs/inception.zip \
 ### Building all at once

-If you just want to get the libraries compiled in a hurry, you can run:
+If you just want to get the libraries compiled in a hurry, you can run this
+from the root of your TensorFlow source folder:

 ```bash
-build_all_ios.sh
+tensorflow/contrib/makefile/build_all_ios.sh
 ```

-and wait a long time.
+This process will take around twenty minutes on a modern MacBook Pro.

-When this completes, you will have a library for a single architecture and the
+When it completes, you will have a library for a single architecture and the
 benchmark program. Although successfully compiling the benchmark program is a
 sign of success, the program is not a complete iOS app.
@@ -284,6 +285,17 @@ make -f tensorflow/contrib/makefile/Makefile HOST_OS=PI TARGET=PI \
 OPTFLAGS="-Os -mfpu=neon-vfpv4 -funsafe-math-optimizations -ftree-vectorize"
 ```

+If you hit compilation errors mentioning `__atomic_compare_exchange` and you're
+using gcc 4.9, you should try installing gcc 4.8 and using that instead:
+
+```bash
+sudo apt-get install -y gcc-4.8 g++-4.8
+make -f tensorflow/contrib/makefile/Makefile HOST_OS=PI TARGET=PI \
+OPTFLAGS="-Os -mfpu=neon-vfpv4 -funsafe-math-optimizations -ftree-vectorize" \
+CXX=g++-4.8
+```
+
 # Other notes

 ## Supported Systems


@@ -1,4 +1,4 @@
-#!/bin/bash -x
+#!/bin/bash -ex
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,11 +15,22 @@
 # ==============================================================================

 DOWNLOADS_DIR=tensorflow/contrib/makefile/downloads
+BZL_FILE_PATH=tensorflow/workspace.bzl

-mkdir ${DOWNLOADS_DIR}
+mkdir -p ${DOWNLOADS_DIR}

 # Grab the current Eigen version name from the Bazel build file
-EIGEN_HASH=$(cat eigen.BUILD | grep archive_dir | head -1 | cut -f3 -d- | cut -f1 -d\")
+EIGEN_HASH=$(cat "${BZL_FILE_PATH}" | egrep "eigen_version.*=.*\".*\"" | awk '{ print $3 }')
+# Trim trailing and preceding double quotes
+EIGEN_HASH="${EIGEN_HASH%\"}"
+EIGEN_HASH="${EIGEN_HASH#\"}"
+if [[ -z "${EIGEN_HASH}" ]]; then
+  echo >&2 "Eigen hash does not exist."
+  exit 1
+else
+  echo "Eigen hash = ${EIGEN_HASH}"
+fi

 curl "https://bitbucket.org/eigen/eigen/get/${EIGEN_HASH}.tar.gz" \
 -o /tmp/eigen-${EIGEN_HASH}.tar.gz

@@ -34,3 +45,5 @@ git clone https://github.com/google/protobuf.git ${DOWNLOADS_DIR}/protobuf
 cd ${DOWNLOADS_DIR}
 rm -rf eigen-latest
 ln -s eigen-eigen-${EIGEN_HASH} eigen-latest
+
+echo "download_dependencies.sh completed successfully."


@@ -16,16 +16,6 @@
 # This script generates the source file lists needed by the makefile by querying
 # the master Bazel build configuration.

-bazel query 'kind("source file", deps(//tensorflow/core:android_tensorflow_lib))' | \
-grep "//tensorflow/.*\.cc$" | \
-grep -v "gen_proto_text" | \
-grep -E -v "jpeg" | \
-grep -E -v "png" | \
-grep -E -v "zlib" | \
-sed -E 's#^//##g' | \
-sed -E 's#:#/#g' \
-> tensorflow/contrib/makefile/tf_cc_files.txt
-
 bazel query 'kind("source file", deps(//tensorflow/core:android_tensorflow_lib))' | \
 grep "//tensorflow/.*\.proto$" | \
 sed -E 's#^//##g' | \


tensorflow/contrib/makefile/tf_cc_files.txt (deleted; its only consumer in the Makefile is removed above)

@@ -1,264 +0,0 @@
tensorflow/core/kernels/xent_op.cc
tensorflow/core/kernels/where_op.cc
tensorflow/core/kernels/variable_ops.cc
tensorflow/core/kernels/unpack_op.cc
tensorflow/core/kernels/transpose_op.cc
tensorflow/core/kernels/transpose_functor_cpu.cc
tensorflow/core/kernels/training_ops.cc
tensorflow/core/kernels/topk_op.cc
tensorflow/core/kernels/tile_ops.cc
tensorflow/core/kernels/strided_slice_op.cc
tensorflow/core/kernels/stack_ops.cc
tensorflow/core/kernels/split_op.cc
tensorflow/core/kernels/split_lib_cpu.cc
tensorflow/core/kernels/sparse_to_dense_op.cc
tensorflow/core/kernels/softsign_op.cc
tensorflow/core/kernels/softplus_op.cc
tensorflow/core/kernels/softmax_op.cc
tensorflow/core/kernels/slice_op.cc
tensorflow/core/kernels/shape_ops.cc
tensorflow/core/kernels/session_ops.cc
tensorflow/core/kernels/sequence_ops.cc
tensorflow/core/kernels/sendrecv_ops.cc
tensorflow/core/kernels/save_restore_tensor.cc
tensorflow/core/kernels/save_op.cc
tensorflow/core/kernels/reverse_sequence_op.cc
tensorflow/core/kernels/reverse_op.cc
tensorflow/core/kernels/restore_op.cc
tensorflow/core/kernels/resize_nearest_neighbor_op.cc
tensorflow/core/kernels/resize_bilinear_op.cc
tensorflow/core/kernels/reshape_op.cc
tensorflow/core/kernels/relu_op.cc
tensorflow/core/kernels/reduction_ops_sum.cc
tensorflow/core/kernels/reduction_ops_prod.cc
tensorflow/core/kernels/reduction_ops_min.cc
tensorflow/core/kernels/reduction_ops_mean.cc
tensorflow/core/kernels/reduction_ops_max.cc
tensorflow/core/kernels/reduction_ops_common.cc
tensorflow/core/kernels/pooling_ops_common.cc
tensorflow/core/kernels/pad_op.cc
tensorflow/core/kernels/pack_op.cc
tensorflow/core/kernels/ops_util.cc
tensorflow/core/kernels/no_op.cc
tensorflow/core/kernels/maxpooling_op.cc
tensorflow/core/kernels/matmul_op.cc
tensorflow/core/kernels/lrn_op.cc
tensorflow/core/kernels/in_topk_op.cc
tensorflow/core/kernels/immutable_constant_op.cc
tensorflow/core/kernels/identity_op.cc
tensorflow/core/kernels/gather_op.cc
tensorflow/core/kernels/fill_functor.cc
tensorflow/core/kernels/example_parsing_ops.cc
tensorflow/core/kernels/dynamic_stitch_op.cc
tensorflow/core/kernels/dynamic_partition_op.cc
tensorflow/core/kernels/dense_update_ops.cc
tensorflow/core/kernels/cwise_ops_common.cc
tensorflow/core/kernels/cwise_op_tanh.cc
tensorflow/core/kernels/cwise_op_sub.cc
tensorflow/core/kernels/cwise_op_squared_difference.cc
tensorflow/core/kernels/cwise_op_square.cc
tensorflow/core/kernels/cwise_op_sqrt.cc
tensorflow/core/kernels/cwise_op_sigmoid.cc
tensorflow/core/kernels/cwise_op_select.cc
tensorflow/core/kernels/cwise_op_rsqrt.cc
tensorflow/core/kernels/cwise_op_neg.cc
tensorflow/core/kernels/cwise_op_mul.cc
tensorflow/core/kernels/cwise_op_minimum.cc
tensorflow/core/kernels/cwise_op_maximum.cc
tensorflow/core/kernels/cwise_op_log.cc
tensorflow/core/kernels/cwise_op_less.cc
tensorflow/core/kernels/cwise_op_isfinite.cc
tensorflow/core/kernels/cwise_op_inverse.cc
tensorflow/core/kernels/cwise_op_greater.cc
tensorflow/core/kernels/cwise_op_exp.cc
tensorflow/core/kernels/cwise_op_equal_to.cc
tensorflow/core/kernels/cwise_op_div.cc
tensorflow/core/kernels/cwise_op_add.cc
tensorflow/core/kernels/ctc_decoder_ops.cc
tensorflow/core/kernels/conv_ops.cc
tensorflow/core/kernels/conv_grad_ops.cc
tensorflow/core/kernels/control_flow_ops.cc
tensorflow/core/kernels/constant_op.cc
tensorflow/core/kernels/concat_op.cc
tensorflow/core/kernels/concat_lib_cpu.cc
tensorflow/core/kernels/check_numerics_op.cc
tensorflow/core/kernels/cast_op.cc
tensorflow/core/kernels/bias_op.cc
tensorflow/core/kernels/bcast_ops.cc
tensorflow/core/kernels/batch_norm_op.cc
tensorflow/core/kernels/avgpooling_op.cc
tensorflow/core/kernels/argmax_op.cc
tensorflow/core/kernels/aggregate_ops.cc
tensorflow/core/util/work_sharder.cc
tensorflow/core/util/util.cc
tensorflow/core/util/use_cudnn.cc
tensorflow/core/util/tensor_slice_writer.cc
tensorflow/core/util/tensor_slice_set.cc
tensorflow/core/util/tensor_slice_reader_cache.cc
tensorflow/core/util/tensor_slice_reader.cc
tensorflow/core/util/tensor_format.cc
tensorflow/core/util/stat_summarizer.cc
tensorflow/core/util/sparse/group_iterator.cc
tensorflow/core/util/saved_tensor_slice_util.cc
tensorflow/core/util/port.cc
tensorflow/core/util/padding.cc
tensorflow/core/util/mirror_pad_mode.cc
tensorflow/core/util/memmapped_file_system_writer.cc
tensorflow/core/util/memmapped_file_system.cc
tensorflow/core/util/guarded_philox_random.cc
tensorflow/core/util/example_proto_helper.cc
tensorflow/core/util/device_name_utils.cc
tensorflow/core/util/command_line_flags.cc
tensorflow/core/util/bcast.cc
tensorflow/core/platform/tracing.cc
tensorflow/core/platform/tensor_coding.cc
tensorflow/core/platform/protobuf_util.cc
tensorflow/core/platform/posix/posix_file_system.cc
tensorflow/core/platform/posix/port.cc
tensorflow/core/platform/posix/env.cc
tensorflow/core/platform/load_library.cc
tensorflow/core/platform/file_system.cc
tensorflow/core/platform/env.cc
tensorflow/core/platform/denormal.cc
tensorflow/core/platform/default/tracing.cc
tensorflow/core/platform/default/logging.cc
tensorflow/core/ops/training_ops.cc
tensorflow/core/ops/string_ops.cc
tensorflow/core/ops/state_ops.cc
tensorflow/core/ops/sparse_ops.cc
tensorflow/core/ops/sendrecv_ops.cc
tensorflow/core/ops/script_ops.cc
tensorflow/core/ops/random_ops.cc
tensorflow/core/ops/random_grad.cc
tensorflow/core/ops/parsing_ops.cc
tensorflow/core/ops/no_op.cc
tensorflow/core/ops/nn_ops.cc
tensorflow/core/ops/nn_grad.cc
tensorflow/core/ops/math_ops.cc
tensorflow/core/ops/math_grad.cc
tensorflow/core/ops/logging_ops.cc
tensorflow/core/ops/linalg_ops.cc
tensorflow/core/ops/io_ops.cc
tensorflow/core/ops/image_ops.cc
tensorflow/core/ops/functional_ops.cc
tensorflow/core/ops/functional_grad.cc
tensorflow/core/ops/function_ops.cc
tensorflow/core/ops/data_flow_ops.cc
tensorflow/core/ops/ctc_ops.cc
tensorflow/core/ops/control_flow_ops.cc
tensorflow/core/ops/candidate_sampling_ops.cc
tensorflow/core/ops/array_ops.cc
tensorflow/core/ops/array_grad.cc
tensorflow/core/lib/wav/wav_io.cc
tensorflow/core/lib/strings/stringprintf.cc
tensorflow/core/lib/strings/strcat.cc
tensorflow/core/lib/strings/str_util.cc
tensorflow/core/lib/strings/scanner.cc
tensorflow/core/lib/strings/proto_text_util.cc
tensorflow/core/lib/strings/ordered_code.cc
tensorflow/core/lib/strings/numbers.cc
tensorflow/core/lib/random/weighted_picker.cc
tensorflow/core/lib/random/simple_philox.cc
tensorflow/core/lib/random/random.cc
tensorflow/core/lib/random/distribution_sampler.cc
tensorflow/core/lib/io/two_level_iterator.cc
tensorflow/core/lib/io/table_builder.cc
tensorflow/core/lib/io/table.cc
tensorflow/core/lib/io/record_writer.cc
tensorflow/core/lib/io/record_reader.cc
tensorflow/core/lib/io/path.cc
tensorflow/core/lib/io/match.cc
tensorflow/core/lib/io/iterator.cc
tensorflow/core/lib/io/inputbuffer.cc
tensorflow/core/lib/io/format.cc
tensorflow/core/lib/io/block_builder.cc
tensorflow/core/lib/io/block.cc
tensorflow/core/lib/histogram/histogram.cc
tensorflow/core/lib/hash/hash.cc
tensorflow/core/lib/hash/crc32c.cc
tensorflow/core/lib/core/threadpool.cc
tensorflow/core/lib/core/stringpiece.cc
tensorflow/core/lib/core/status.cc
tensorflow/core/lib/core/coding.cc
tensorflow/core/lib/core/arena.cc
tensorflow/core/graph/validate.cc
tensorflow/core/graph/tensor_id.cc
tensorflow/core/graph/subgraph.cc
tensorflow/core/graph/quantize_training.cc
tensorflow/core/graph/optimizer_cse.cc
tensorflow/core/graph/node_builder.cc
tensorflow/core/graph/graph_partition.cc
tensorflow/core/graph/graph_def_builder.cc
tensorflow/core/graph/graph_constructor.cc
tensorflow/core/graph/graph.cc
tensorflow/core/graph/gradients.cc
tensorflow/core/graph/equal_graph_def.cc
tensorflow/core/graph/edgeset.cc
tensorflow/core/graph/costmodel.cc
tensorflow/core/graph/colors.cc
tensorflow/core/graph/algorithm.cc
tensorflow/core/framework/versions.cc
tensorflow/core/framework/unique_tensor_references.cc
tensorflow/core/framework/types.cc
tensorflow/core/framework/tracking_allocator.cc
tensorflow/core/framework/tensor_util.cc
tensorflow/core/framework/tensor_slice.cc
tensorflow/core/framework/tensor_shape.cc
tensorflow/core/framework/tensor_reference.cc
tensorflow/core/framework/tensor.cc
tensorflow/core/framework/shape_inference.cc
tensorflow/core/framework/resource_mgr.cc
tensorflow/core/framework/rendezvous.cc
tensorflow/core/framework/reader_op_kernel.cc
tensorflow/core/framework/partial_tensor_shape.cc
tensorflow/core/framework/op_segment.cc
tensorflow/core/framework/op_kernel.cc
tensorflow/core/framework/op_gen_lib.cc
tensorflow/core/framework/op_def_util.cc
tensorflow/core/framework/op_def_builder.cc
tensorflow/core/framework/op.cc
tensorflow/core/framework/node_def_util.cc
tensorflow/core/framework/node_def_builder.cc
tensorflow/core/framework/memory_types.cc
tensorflow/core/framework/lookup_interface.cc
tensorflow/core/framework/log_memory.cc
tensorflow/core/framework/load_library.cc
tensorflow/core/framework/kernel_def_builder.cc
tensorflow/core/framework/graph_def_util.cc
tensorflow/core/framework/function.cc
tensorflow/core/framework/fake_input.cc
tensorflow/core/framework/device_base.cc
tensorflow/core/framework/common_shape_fns.cc
tensorflow/core/framework/cancellation.cc
tensorflow/core/framework/bfloat16.cc
tensorflow/core/framework/attr_value_util.cc
tensorflow/core/framework/allocator.cc
tensorflow/core/common_runtime/threadpool_device_factory.cc
tensorflow/core/common_runtime/threadpool_device.cc
tensorflow/core/common_runtime/step_stats_collector.cc
tensorflow/core/common_runtime/simple_placer.cc
tensorflow/core/common_runtime/simple_graph_execution_state.cc
tensorflow/core/common_runtime/session_state.cc
tensorflow/core/common_runtime/session_options.cc
tensorflow/core/common_runtime/session_factory.cc
tensorflow/core/common_runtime/session.cc
tensorflow/core/common_runtime/rendezvous_mgr.cc
tensorflow/core/common_runtime/process_util.cc
tensorflow/core/common_runtime/memory_types.cc
tensorflow/core/common_runtime/local_device.cc
tensorflow/core/common_runtime/graph_optimizer.cc
tensorflow/core/common_runtime/gpu/gpu_tracer.cc
tensorflow/core/common_runtime/function.cc
tensorflow/core/common_runtime/executor.cc
tensorflow/core/common_runtime/direct_session.cc
tensorflow/core/common_runtime/device_set.cc
tensorflow/core/common_runtime/device_mgr.cc
tensorflow/core/common_runtime/device_factory.cc
tensorflow/core/common_runtime/device.cc
tensorflow/core/common_runtime/costmodel_manager.cc
tensorflow/core/common_runtime/copy_tensor.cc
tensorflow/core/common_runtime/constant_folding.cc
tensorflow/core/common_runtime/build_graph_options.cc
tensorflow/core/common_runtime/bfc_allocator.cc
tensorflow/core/common_runtime/allocator_retry.cc
tensorflow/core/client/tensor_c_api.cc


tensorflow/contrib/makefile/tf_op_files.txt (new file; read by the Makefile's `TF_CC_SRCS +=` line above)

@@ -0,0 +1,124 @@
tensorflow/core/kernels/xent_op.cc
tensorflow/core/kernels/where_op.cc
tensorflow/core/kernels/variable_ops.cc
tensorflow/core/kernels/unpack_op.cc
tensorflow/core/kernels/transpose_op.cc
tensorflow/core/kernels/transpose_functor_cpu.cc
tensorflow/core/kernels/training_ops.cc
tensorflow/core/kernels/topk_op.cc
tensorflow/core/kernels/tile_ops.cc
tensorflow/core/kernels/strided_slice_op_inst_6.cc
tensorflow/core/kernels/strided_slice_op_inst_5.cc
tensorflow/core/kernels/strided_slice_op_inst_4.cc
tensorflow/core/kernels/strided_slice_op_inst_3.cc
tensorflow/core/kernels/strided_slice_op_inst_2.cc
tensorflow/core/kernels/strided_slice_op_inst_1.cc
tensorflow/core/kernels/strided_slice_op.cc
tensorflow/core/kernels/stack_ops.cc
tensorflow/core/kernels/split_op.cc
tensorflow/core/kernels/split_lib_cpu.cc
tensorflow/core/kernels/sparse_to_dense_op.cc
tensorflow/core/kernels/softsign_op.cc
tensorflow/core/kernels/softplus_op.cc
tensorflow/core/kernels/softmax_op.cc
tensorflow/core/kernels/slice_op.cc
tensorflow/core/kernels/shape_ops.cc
tensorflow/core/kernels/session_ops.cc
tensorflow/core/kernels/sequence_ops.cc
tensorflow/core/kernels/sendrecv_ops.cc
tensorflow/core/kernels/save_restore_tensor.cc
tensorflow/core/kernels/save_op.cc
tensorflow/core/kernels/reverse_sequence_op.cc
tensorflow/core/kernels/reverse_op.cc
tensorflow/core/kernels/restore_op.cc
tensorflow/core/kernels/resize_nearest_neighbor_op.cc
tensorflow/core/kernels/resize_bilinear_op.cc
tensorflow/core/kernels/reshape_op.cc
tensorflow/core/kernels/relu_op.cc
tensorflow/core/kernels/reduction_ops_sum.cc
tensorflow/core/kernels/reduction_ops_prod.cc
tensorflow/core/kernels/reduction_ops_min.cc
tensorflow/core/kernels/reduction_ops_mean.cc
tensorflow/core/kernels/reduction_ops_max.cc
tensorflow/core/kernels/reduction_ops_common.cc
tensorflow/core/kernels/pooling_ops_common.cc
tensorflow/core/kernels/pad_op.cc
tensorflow/core/kernels/pack_op.cc
tensorflow/core/kernels/ops_util.cc
tensorflow/core/kernels/no_op.cc
tensorflow/core/kernels/maxpooling_op.cc
tensorflow/core/kernels/matmul_op.cc
tensorflow/core/kernels/lrn_op.cc
tensorflow/core/kernels/in_topk_op.cc
tensorflow/core/kernels/immutable_constant_op.cc
tensorflow/core/kernels/identity_op.cc
tensorflow/core/kernels/gather_op.cc
tensorflow/core/kernels/fill_functor.cc
tensorflow/core/kernels/example_parsing_ops.cc
tensorflow/core/kernels/dynamic_stitch_op.cc
tensorflow/core/kernels/dynamic_partition_op.cc
tensorflow/core/kernels/dense_update_ops.cc
tensorflow/core/kernels/cwise_ops_common.cc
tensorflow/core/kernels/cwise_op_tanh.cc
tensorflow/core/kernels/cwise_op_sub.cc
tensorflow/core/kernels/cwise_op_squared_difference.cc
tensorflow/core/kernels/cwise_op_square.cc
tensorflow/core/kernels/cwise_op_sqrt.cc
tensorflow/core/kernels/cwise_op_sigmoid.cc
tensorflow/core/kernels/cwise_op_select.cc
tensorflow/core/kernels/cwise_op_rsqrt.cc
tensorflow/core/kernels/cwise_op_neg.cc
tensorflow/core/kernels/cwise_op_mul.cc
tensorflow/core/kernels/cwise_op_minimum.cc
tensorflow/core/kernels/cwise_op_maximum.cc
tensorflow/core/kernels/cwise_op_log.cc
tensorflow/core/kernels/cwise_op_less.cc
tensorflow/core/kernels/cwise_op_isfinite.cc
tensorflow/core/kernels/cwise_op_inverse.cc
tensorflow/core/kernels/cwise_op_greater.cc
tensorflow/core/kernels/cwise_op_exp.cc
tensorflow/core/kernels/cwise_op_equal_to.cc
tensorflow/core/kernels/cwise_op_div.cc
tensorflow/core/kernels/cwise_op_add.cc
tensorflow/core/kernels/ctc_decoder_ops.cc
tensorflow/core/kernels/conv_ops.cc
tensorflow/core/kernels/conv_grad_ops.cc
tensorflow/core/kernels/control_flow_ops.cc
tensorflow/core/kernels/constant_op.cc
tensorflow/core/kernels/concat_op.cc
tensorflow/core/kernels/concat_lib_cpu.cc
tensorflow/core/kernels/check_numerics_op.cc
tensorflow/core/kernels/cast_op.cc
tensorflow/core/kernels/bias_op.cc
tensorflow/core/kernels/bcast_ops.cc
tensorflow/core/kernels/batch_norm_op.cc
tensorflow/core/kernels/avgpooling_op.cc
tensorflow/core/kernels/argmax_op.cc
tensorflow/core/kernels/aggregate_ops.cc
tensorflow/core/ops/training_ops.cc
tensorflow/core/ops/string_ops.cc
tensorflow/core/ops/state_ops.cc
tensorflow/core/ops/sparse_ops.cc
tensorflow/core/ops/sendrecv_ops.cc
tensorflow/core/ops/script_ops.cc
tensorflow/core/ops/random_ops.cc
tensorflow/core/ops/random_grad.cc
tensorflow/core/ops/parsing_ops.cc
tensorflow/core/ops/no_op.cc
tensorflow/core/ops/nn_ops.cc
tensorflow/core/ops/nn_grad.cc
tensorflow/core/ops/math_ops.cc
tensorflow/core/ops/math_grad.cc
tensorflow/core/ops/logging_ops.cc
tensorflow/core/ops/linalg_ops.cc
tensorflow/core/ops/io_ops.cc
tensorflow/core/ops/image_ops.cc
tensorflow/core/ops/functional_ops.cc
tensorflow/core/ops/functional_grad.cc
tensorflow/core/ops/function_ops.cc
tensorflow/core/ops/data_flow_ops.cc
tensorflow/core/ops/ctc_ops.cc
tensorflow/core/ops/control_flow_ops.cc
tensorflow/core/ops/candidate_sampling_ops.cc
tensorflow/core/ops/array_ops.cc
tensorflow/core/ops/array_grad.cc


@@ -69,6 +69,8 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":ops",
+        "//tensorflow/contrib/quantization:quantized_ops_py",
+        "//tensorflow/contrib/quantization/kernels:quantized_kernels_py",
     ],
 )


@@ -603,6 +603,7 @@ filegroup(
         "graph/dot.*",
         "lib/jpeg/**/*",
         "lib/png/**/*",
+        "lib/gif/**/*",
         "util/checkpoint_reader.*",
         "util/events_writer.*",
         "util/reporter.*",
@@ -613,6 +614,7 @@ filegroup(
         "platform/google/**/*",
         "platform/jpeg.*",
         "platform/png.*",
+        "platform/gif.*",
         "platform/stream_executor.*",
         "user_ops/**/*.cu.cc",
         "common_runtime/gpu/**/*",
@@ -843,6 +845,7 @@ cc_library(
     hdrs = [
         "lib/core/blocking_counter.h",
         "lib/core/refcount.h",
+        "lib/gif/gif_io.h",
         "lib/gtl/edit_distance.h",
         "lib/gtl/int_type.h",
         "lib/gtl/iterator_range.h",
@@ -1967,6 +1970,10 @@ filegroup(
         "lib/jpeg/testdata/corrupt34_3.jpg",
         # -- hand-edited variant: stops after a restart marker
         "lib/jpeg/testdata/corrupt34_4.jpg",
+        # GIF data
+        "lib/gif/testdata/scan.gif",
+        # GIF data with optimization
+        "lib/gif/testdata/optimized.gif",
     ],
 )


@@ -859,6 +859,7 @@ tf_kernel_libraries(
         "crop_and_resize_op",
         "decode_jpeg_op",
         "decode_png_op",
+        "decode_gif_op",
         "draw_bounding_box_op",
         "encode_jpeg_op",
         "attention_ops",
@@ -1108,6 +1109,7 @@ tf_kernel_libraries(
         "matmul_op",
         "reduction_ops",
         "segment_reduction_ops",
+        "scan_ops",
         "sequence_ops",
         "sparse_matmul_op",
     ],
@@ -2040,6 +2042,7 @@ filegroup(
         "decode_png_op.*",
         "encode_jpeg_op.*",
         "decode_jpeg_op.*",
+        "decode_gif_op.*",
         "identity_reader_op.*",
         "reader_base.*",
         "fixed_length_record_reader_op.*",


@@ -36,7 +36,7 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename Device>
+template <typename Device, typename T>
 class RGBToHSVOp : public OpKernel {
  public:
   explicit RGBToHSVOp(OpKernelConstruction* context) : OpKernel(context) {}
@@ -59,23 +59,23 @@ class RGBToHSVOp : public OpKernel {
     // Make a canonical image, maintaining the last (channel) dimension, while
     // flattening all others to give the functor easy-to-work-with data.
-    TTypes<float, 2>::ConstTensor input_data = input.flat_inner_dims<float>();
-    TTypes<float, 2>::Tensor output_data = output->flat_inner_dims<float>();
+    typename TTypes<T, 2>::ConstTensor input_data = input.flat_inner_dims<T>();
+    typename TTypes<T, 2>::Tensor output_data = output->flat_inner_dims<T>();
 
     Tensor trange;
     OP_REQUIRES_OK(
-        context, context->allocate_temp(DataTypeToEnum<float>::value,
+        context, context->allocate_temp(DataTypeToEnum<T>::value,
                                         TensorShape({input_data.dimension(0)}),
                                         &trange));
 
-    TTypes<float, 1>::Tensor range = trange.tensor<float, 1>();
+    typename TTypes<T, 1>::Tensor range = trange.tensor<T, 1>();
 
-    functor::RGBToHSV<Device>()(context->eigen_device<Device>(), input_data,
-                                range, output_data);
+    functor::RGBToHSV<Device, T>()(context->eigen_device<Device>(), input_data,
+                                   range, output_data);
   }
 };
 
-template <typename Device>
+template <typename Device, typename T>
 class HSVToRGBOp : public OpKernel {
  public:
   explicit HSVToRGBOp(OpKernelConstruction* context) : OpKernel(context) {}
@@ -96,41 +96,54 @@ class HSVToRGBOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input.shape(), &output));
 
-    TTypes<float, 2>::ConstTensor input_data = input.flat_inner_dims<float>();
-    TTypes<float, 2>::Tensor output_data = output->flat_inner_dims<float>();
+    typename TTypes<T, 2>::ConstTensor input_data = input.flat_inner_dims<T>();
+    typename TTypes<T, 2>::Tensor output_data = output->flat_inner_dims<T>();
 
-    functor::HSVToRGB<Device>()(context->eigen_device<Device>(), input_data,
-                                output_data);
+    functor::HSVToRGB<Device, T>()(context->eigen_device<Device>(), input_data,
+                                   output_data);
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU),
-                        RGBToHSVOp<CPUDevice>);
-template class RGBToHSVOp<CPUDevice>;
-REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU),
-                        HSVToRGBOp<CPUDevice>);
-template class HSVToRGBOp<CPUDevice>;
+#define REGISTER_CPU(T)                                        \
+  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU)  \
+                              .TypeConstraint<T>("T"),         \
+                          RGBToHSVOp<CPUDevice, T>);           \
+  template class RGBToHSVOp<CPUDevice, T>;                     \
+  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU)  \
+                              .TypeConstraint<T>("T"),         \
+                          HSVToRGBOp<CPUDevice, T>);           \
+  template class HSVToRGBOp<CPUDevice, T>;
+
+TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
 
 #if GOOGLE_CUDA
 // Forward declarations of the function specializations for GPU (to prevent
 // building the GPU versions here, they will be built compiling _gpu.cu.cc).
 namespace functor {
-template <>
-void RGBToHSV<GPUDevice>::operator()(const GPUDevice& d,
-                                     TTypes<float, 2>::ConstTensor input_data,
-                                     TTypes<float, 1>::Tensor range,
-                                     TTypes<float, 2>::Tensor output_data);
-extern template struct RGBToHSV<GPUDevice>;
-template <>
-void HSVToRGB<GPUDevice>::operator()(const GPUDevice& d,
-                                     TTypes<float, 2>::ConstTensor input_data,
-                                     TTypes<float, 2>::Tensor output_data);
-extern template struct HSVToRGB<GPUDevice>;
+#define DECLARE_GPU(T)                                                        \
+  template <>                                                                 \
+  void RGBToHSV<GPUDevice, T>::operator()(const GPUDevice& d,                 \
+                                          TTypes<T, 2>::ConstTensor input_data, \
+                                          TTypes<T, 1>::Tensor range,         \
+                                          TTypes<T, 2>::Tensor output_data);  \
+  extern template struct RGBToHSV<GPUDevice, T>;                              \
+  template <>                                                                 \
+  void HSVToRGB<GPUDevice, T>::operator()(const GPUDevice& d,                 \
+                                          TTypes<T, 2>::ConstTensor input_data, \
+                                          TTypes<T, 2>::Tensor output_data);  \
+  extern template struct HSVToRGB<GPUDevice, T>;
+
+TF_CALL_float(DECLARE_GPU);
+TF_CALL_double(DECLARE_GPU);
 }  // namespace functor
 
-REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU),
-                        RGBToHSVOp<GPUDevice>);
-REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU),
-                        HSVToRGBOp<GPUDevice>);
+#define REGISTER_GPU(T)                                        \
+  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU)  \
+                              .TypeConstraint<T>("T"),         \
+                          RGBToHSVOp<GPUDevice, T>);           \
+  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU)  \
+                              .TypeConstraint<T>("T"),         \
+                          HSVToRGBOp<GPUDevice, T>);
+
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
 #endif
 
 }  // namespace tensorflow
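As a cross-check on the templated kernel above, the per-pixel RGB-to-HSV arithmetic it delegates to the functor can be sketched in NumPy (a minimal illustration; the function name and the (N, 3) layout are my own, not part of the commit):

import numpy as np

def rgb_to_hsv_reference(rgb):
    """Per-pixel math of the RGBToHSV functor on an (N, 3) array (sketch)."""
    r, g, b = rgb[:, 0], rgb[:, 1], rgb[:, 2]
    v = rgb.max(axis=1)                       # value = max channel
    rng = v - rgb.min(axis=1)                 # range = max - min
    s = np.where(v > 0, rng / np.where(v > 0, v, 1.0), 0.0)
    norm = np.where(rng > 0, 1.0 / (6.0 * np.where(rng > 0, rng, 1.0)), 0.0)
    h = np.where(r == v, norm * (g - b),
                 np.where(g == v, norm * (b - r) + 2.0 / 6.0,
                          norm * (r - g) + 4.0 / 6.0))
    h = np.where(rng > 0, h, 0.0)             # grays have hue 0
    h = np.where(h < 0, h + 1.0, h)           # wrap negative hue
    return np.stack([h, s, v], axis=1)

print(rgb_to_hsv_reference(np.array([[.8, .4, .2]])))  # ~[0.0556, 0.75, 0.8]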


@@ -24,18 +24,19 @@ namespace tensorflow {
 namespace functor {
 
-template <typename Device>
+template <typename Device, typename T>
 struct RGBToHSV {
-  void operator()(const Device &d, TTypes<float, 2>::ConstTensor input_data,
-                  TTypes<float, 1>::Tensor range,
-                  TTypes<float, 2>::Tensor output_data) {
-    auto H = output_data.chip<1>(0);
-    auto S = output_data.chip<1>(1);
-    auto V = output_data.chip<1>(2);
+  void operator()(const Device &d,
+                  typename TTypes<T, 2>::ConstTensor input_data,
+                  typename TTypes<T, 1>::Tensor range,
+                  typename TTypes<T, 2>::Tensor output_data) {
+    auto H = output_data.template chip<1>(0);
+    auto S = output_data.template chip<1>(1);
+    auto V = output_data.template chip<1>(2);
 
-    auto R = input_data.chip<1>(0);
-    auto G = input_data.chip<1>(1);
-    auto B = input_data.chip<1>(2);
+    auto R = input_data.template chip<1>(0);
+    auto G = input_data.template chip<1>(1);
+    auto B = input_data.template chip<1>(2);
 
 #if !defined(EIGEN_HAS_INDEX_LIST)
     Eigen::array<int, 1> channel_axis{{1}};
@@ -47,38 +48,40 @@ struct RGBToHSV {
     range.device(d) = V - input_data.minimum(channel_axis);
 
-    S.device(d) = (V > 0.f).select(range / V, V.constant(0.f));
+    S.device(d) = (V > T(0)).select(range / V, V.constant(T(0)));
 
-    auto norm = range.inverse() * (1.f / 6.f);
+    auto norm = range.inverse() * (T(1) / T(6));
     // TODO(wicke): all these assignments are only necessary because a combined
     // expression is larger than kernel parameter space. A custom kernel is
     // probably in order.
     H.device(d) = (R == V).select(norm * (G - B),
-                                  (G == V).select(norm * (B - R) + 2.f / 6.f,
-                                                  norm * (R - G) + 4.f / 6.f));
-    H.device(d) = (range > 0.f).select(H, H.constant(0.f));
-    H.device(d) = (H < 0.f).select(H + 1.f, H);
+                                  (G == V).select(
+                                      norm * (B - R) + T(2) / T(6),
+                                      norm * (R - G) + T(4) / T(6)));
+    H.device(d) = (range > T(0)).select(H, H.constant(T(0)));
+    H.device(d) = (H < T(0)).select(H + T(1), H);
   }
 };
 
-template <typename Device>
+template <typename Device, typename T>
 struct HSVToRGB {
-  void operator()(const Device &d, TTypes<float, 2>::ConstTensor input_data,
-                  TTypes<float, 2>::Tensor output_data) {
-    auto H = input_data.chip<1>(0);
-    auto S = input_data.chip<1>(1);
-    auto V = input_data.chip<1>(2);
+  void operator()(const Device &d,
+                  typename TTypes<T, 2>::ConstTensor input_data,
+                  typename TTypes<T, 2>::Tensor output_data) {
+    auto H = input_data.template chip<1>(0);
+    auto S = input_data.template chip<1>(1);
+    auto V = input_data.template chip<1>(2);
 
     // TODO(wicke): compute only the fractional part of H for robustness
-    auto dh = H * 6.f;
-    auto dr = ((dh - 3.f).abs() - 1.f).cwiseMax(0.f).cwiseMin(1.f);
-    auto dg = (-(dh - 2.f).abs() + 2.f).cwiseMax(0.f).cwiseMin(1.f);
-    auto db = (-(dh - 4.f).abs() + 2.f).cwiseMax(0.f).cwiseMin(1.f);
-    auto one_s = -S + 1.f;
+    auto dh = H * T(6);
+    auto dr = ((dh - T(3)).abs() - T(1)).cwiseMax(T(0)).cwiseMin(T(1));
+    auto dg = (-(dh - T(2)).abs() + T(2)).cwiseMax(T(0)).cwiseMin(T(1));
+    auto db = (-(dh - T(4)).abs() + T(2)).cwiseMax(T(0)).cwiseMin(T(1));
+    auto one_s = -S + T(1);
 
-    auto R = output_data.chip<1>(0);
-    auto G = output_data.chip<1>(1);
-    auto B = output_data.chip<1>(2);
+    auto R = output_data.template chip<1>(0);
+    auto G = output_data.template chip<1>(1);
+    auto B = output_data.template chip<1>(2);
 
     R.device(d) = (one_s + S * dr) * V;
     G.device(d) = (one_s + S * dg) * V;
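The piecewise-linear weights dr, dg and db above rebuild each channel from hue; the same formula in NumPy, again as an illustrative sketch:

import numpy as np

def hsv_to_rgb_reference(hsv):
    """Same piecewise-linear reconstruction on an (N, 3) array (sketch)."""
    h, s, v = hsv[:, 0], hsv[:, 1], hsv[:, 2]
    dh = h * 6.0
    dr = np.clip(np.abs(dh - 3.0) - 1.0, 0.0, 1.0)   # red weight
    dg = np.clip(-np.abs(dh - 2.0) + 2.0, 0.0, 1.0)  # green weight
    db = np.clip(-np.abs(dh - 4.0) + 2.0, 0.0, 1.0)  # blue weight
    one_s = 1.0 - s
    return np.stack([(one_s + s * dr) * v,
                     (one_s + s * dg) * v,
                     (one_s + s * db) * v], axis=1)

print(hsv_to_rgb_reference(np.array([[0.0, 1.0, 1.0]])))  # pure red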


@@ -24,8 +24,11 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-template class functor::RGBToHSV<GPUDevice>;
-template class functor::HSVToRGB<GPUDevice>;
+#define INSTANTIATE_GPU(T)                        \
+  template class functor::RGBToHSV<GPUDevice, T>; \
+  template class functor::HSVToRGB<GPUDevice, T>;
+
+TF_CALL_float(INSTANTIATE_GPU);
+TF_CALL_double(INSTANTIATE_GPU);
 }
 #endif  // GOOGLE_CUDA


@@ -29,183 +29,241 @@ limitations under the License.

namespace tensorflow {

template <typename T>
class RGBToHSVOpTest : public OpsTestBase {
 protected:
  void MakeOp(DataType data_type) {
    TF_EXPECT_OK(NodeDefBuilder("rgb_to_hsv_op", "RGBToHSV")
                     .Input(FakeInput(data_type))
                     .Finalize(node_def()));
    TF_EXPECT_OK(InitOp());
  }

  void CheckBlack(DataType data_type) {
    // Black pixel should map to hsv = [0,0,0]
    AddInputFromArray<T>(TensorShape({3}), {0, 0, 0});
    TF_ASSERT_OK(RunOpKernel());
    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {0.0, 0.0, 0.0});
    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
  }

  void CheckGray(DataType data_type) {
    // Gray pixel should have hue = saturation = 0.0, value = r/255
    AddInputFromArray<T>(TensorShape({3}), {.5, .5, .5});
    TF_ASSERT_OK(RunOpKernel());
    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {0.0, 0.0, .5});
    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
  }

  void CheckWhite(DataType data_type) {
    // White pixel should have hue = saturation = 0.0, value = 1.0
    AddInputFromArray<T>(TensorShape({3}), {1, 1, 1});
    TF_ASSERT_OK(RunOpKernel());
    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {0.0, 0.0, 1.0});
    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
  }

  void CheckRedMax(DataType data_type) {
    // Test case where red channel dominates
    AddInputFromArray<T>(TensorShape({3}), {.8, .4, .2});
    TF_ASSERT_OK(RunOpKernel());

    T expected_h = 1. / 6. * .2 / .6;
    T expected_s = .6 / .8;
    T expected_v = .8 / 1.;

    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {expected_h, expected_s, expected_v});
    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
  }

  void CheckGreenMax(DataType data_type) {
    // Test case where green channel dominates
    AddInputFromArray<T>(TensorShape({3}), {.2, .8, .4});
    TF_ASSERT_OK(RunOpKernel());

    T expected_h = 1. / 6. * (2.0 + (.2 / .6));
    T expected_s = .6 / .8;
    T expected_v = .8 / 1.;

    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {expected_h, expected_s, expected_v});
    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
  }

  void CheckBlueMax(DataType data_type) {
    // Test case where blue channel dominates
    AddInputFromArray<T>(TensorShape({3}), {.4, .2, .8});
    TF_ASSERT_OK(RunOpKernel());

    T expected_h = 1. / 6. * (4.0 + (.2 / .6));
    T expected_s = .6 / .8;
    T expected_v = .8 / 1.;

    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {expected_h, expected_s, expected_v});
    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
  }

  void CheckNegativeDifference(DataType data_type) {
    AddInputFromArray<T>(TensorShape({3}), {0, .1, .2});
    TF_ASSERT_OK(RunOpKernel());

    T expected_h = 1. / 6. * (4.0 + (-.1 / .2));
    T expected_s = .2 / .2;
    T expected_v = .2 / 1.;

    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {expected_h, expected_s, expected_v});
    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
  }
};

template <typename T>
class HSVToRGBOpTest : public OpsTestBase {
 protected:
  void MakeOp(DataType data_type) {
    TF_EXPECT_OK(NodeDefBuilder("hsv_to_rgb_op", "HSVToRGB")
                     .Input(FakeInput(data_type))
                     .Finalize(node_def()));
    TF_EXPECT_OK(InitOp());
  }

  void CheckBlack(DataType data_type) {
    // Black pixel should map to rgb = [0,0,0]
    AddInputFromArray<T>(TensorShape({3}), {0.0, 0.0, 0.0});
    TF_ASSERT_OK(RunOpKernel());
    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {0, 0, 0});
    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
  }

  void CheckGray(DataType data_type) {
    // Gray pixel should have hue = saturation = 0.0, value = r/255
    AddInputFromArray<T>(TensorShape({3}), {0.0, 0.0, .5});
    TF_ASSERT_OK(RunOpKernel());
    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {.5, .5, .5});
    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
  }

  void CheckWhite(DataType data_type) {
    // White pixel should have hue = saturation = 0.0, value = 1.0
    AddInputFromArray<T>(TensorShape({3}), {0.0, 0.0, 1.0});
    TF_ASSERT_OK(RunOpKernel());
    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {1, 1, 1});
    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
  }

  void CheckRedMax(DataType data_type) {
    // Test case where red channel dominates
    T expected_h = 1. / 6. * .2 / .6;
    T expected_s = .6 / .8;
    T expected_v = .8 / 1.;

    AddInputFromArray<T>(TensorShape({3}),
                         {expected_h, expected_s, expected_v});
    TF_ASSERT_OK(RunOpKernel());

    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {.8, .4, .2});
    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
  }

  void CheckGreenMax(DataType data_type) {
    // Test case where green channel dominates
    T expected_h = 1. / 6. * (2.0 + (.2 / .6));
    T expected_s = .6 / .8;
    T expected_v = .8 / 1.;

    AddInputFromArray<T>(TensorShape({3}),
                         {expected_h, expected_s, expected_v});
    TF_ASSERT_OK(RunOpKernel());

    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {.2, .8, .4});
    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
  }

  void CheckBlueMax(DataType data_type) {
    // Test case where blue channel dominates
    T expected_h = 1. / 6. * (4.0 + (.2 / .6));
    T expected_s = .6 / .8;
    T expected_v = .8 / 1.0;

    AddInputFromArray<T>(TensorShape({3}),
                         {expected_h, expected_s, expected_v});
    TF_ASSERT_OK(RunOpKernel());

    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {.4, .2, .8});
    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
  }

  void CheckNegativeDifference(DataType data_type) {
    T expected_h = 1. / 6. * (4.0 + (-.1 / .2));
    T expected_s = .2 / .2;
    T expected_v = .2 / 1.;

    AddInputFromArray<T>(TensorShape({3}),
                         {expected_h, expected_s, expected_v});
    TF_ASSERT_OK(RunOpKernel());

    Tensor expected(allocator(), data_type, TensorShape({3}));
    test::FillValues<T>(&expected, {0, .1, .2});
    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
  }
};

#define TEST_COLORSPACE(test, dt)         \
  TEST_F(test, CheckBlack) {              \
    MakeOp(dt);                           \
    CheckBlack(dt);                       \
  }                                       \
  TEST_F(test, CheckGray) {               \
    MakeOp(dt);                           \
    CheckGray(dt);                        \
  }                                       \
  TEST_F(test, CheckWhite) {              \
    MakeOp(dt);                           \
    CheckWhite(dt);                       \
  }                                       \
  TEST_F(test, CheckRedMax) {             \
    MakeOp(dt);                           \
    CheckRedMax(dt);                      \
  }                                       \
  TEST_F(test, CheckGreenMax) {           \
    MakeOp(dt);                           \
    CheckGreenMax(dt);                    \
  }                                       \
  TEST_F(test, CheckBlueMax) {            \
    MakeOp(dt);                           \
    CheckBlueMax(dt);                     \
  }                                       \
  TEST_F(test, CheckNegativeDifference) { \
    MakeOp(dt);                           \
    CheckNegativeDifference(dt);          \
  }

typedef RGBToHSVOpTest<float> rgb_to_hsv_float;
typedef RGBToHSVOpTest<double> rgb_to_hsv_double;
TEST_COLORSPACE(rgb_to_hsv_float, DT_FLOAT);
TEST_COLORSPACE(rgb_to_hsv_double, DT_DOUBLE);

typedef HSVToRGBOpTest<float> hsv_to_rgb_float;
typedef HSVToRGBOpTest<double> hsv_to_rgb_double;
TEST_COLORSPACE(hsv_to_rgb_float, DT_FLOAT);
TEST_COLORSPACE(hsv_to_rgb_double, DT_DOUBLE);

}  // namespace tensorflow
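The constants hard-coded in the Check* expectations agree with the standard HSV definition; a self-contained sanity check against Python's stdlib colorsys (not part of the test suite):

import colorsys

# The CheckRedMax expectations, checked against the stdlib HSV conversion.
h, s, v = colorsys.rgb_to_hsv(.8, .4, .2)
assert abs(h - (1. / 6. * .2 / .6)) < 1e-6
assert abs(s - .6 / .8) < 1e-6
assert abs(v - .8) < 1e-6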

View File

@@ -65,7 +65,7 @@ class BinaryOpShared : public OpKernel {
 
 // Coefficient-wise binary operations:
 //   Device: E.g., CPUDevice, GPUDevice.
-//   Functor: defined in cwise_functors.h. E.g., functor::add2.
+//   Functor: defined in cwise_ops.h. E.g., functor::add.
 template <typename Device, typename Functor>
 class BinaryOp : public BinaryOpShared {
  public:
@@ -162,7 +162,7 @@ class SimpleBinaryOp : public OpKernel {
 
 // Coefficient-wise unary operations:
 //   Device: E.g., CPUDevice, GPUDevice.
-//   Functor: defined in cwise_functors.h. E.g., functor::sqrt.
+//   Functor: defined in cwise_ops.h. E.g., functor::sqrt.
 template <typename Device, typename Functor>
 class UnaryOp : public OpKernel {
  public:


@ -0,0 +1,66 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// See docs in ../ops/image_ops.cc
#include <memory>
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gif/gif_io.h"
#include "tensorflow/core/platform/logging.h"
namespace tensorflow {
// Decode the contents of a GIF file
class DecodeGifOp : public OpKernel {
public:
explicit DecodeGifOp(OpKernelConstruction* context) : OpKernel(context) {}
void Compute(OpKernelContext* context) override {
const Tensor& contents = context->input(0);
OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
errors::InvalidArgument("contents must be scalar, got shape ",
contents.shape().DebugString()));
// Start decoding image to get shape details
const StringPiece input = contents.scalar<string>()();
// Decode image, allocating tensor once the image size is known
Tensor* output = nullptr;
OP_REQUIRES(
context,
gif::Decode(input.data(), input.size(),
[=, &output](int num_frames, int width, int height,
int channels) -> uint8* {
Status status(context->allocate_output(
0, TensorShape({num_frames, height, width, channels}),
&output));
if (!status.ok()) {
VLOG(1) << status;
context->SetStatus(status);
return nullptr;
}
return output->flat<uint8>().data();
}),
errors::InvalidArgument("Invalid GIF data, size ", input.size()));
}
};
REGISTER_KERNEL_BUILDER(Name("DecodeGif").Device(DEVICE_CPU), DecodeGifOp);
} // namespace tensorflow
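A sketch of driving the new kernel from Python, under the assumption that the generated wrapper for the DecodeGif op is exposed as tf.image.decode_gif (session-style API of this era):

import tensorflow as tf

# Hypothetical binding name for the new "DecodeGif" op; the file path is
# an example input.
with open("animation.gif", "rb") as f:
    gif_bytes = f.read()

frames = tf.image.decode_gif(gif_bytes)  # uint8, [num_frames, height, width, 3]
with tf.Session() as sess:
    out = sess.run(frames)
print(out.shape, out.dtype)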


@@ -97,13 +97,7 @@ class ReverseOp : public OpKernel {
                               .HostMemory("dims"), \
                           ReverseOp<CPUDevice, T>)
 
-TF_CALL_uint8(REGISTER_KERNEL);
-TF_CALL_int8(REGISTER_KERNEL);
-TF_CALL_int32(REGISTER_KERNEL);
-TF_CALL_bool(REGISTER_KERNEL);
-TF_CALL_half(REGISTER_KERNEL);
-TF_CALL_float(REGISTER_KERNEL);
-TF_CALL_double(REGISTER_KERNEL);
+TF_CALL_POD_TYPES(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
@@ -136,6 +130,8 @@ TF_CALL_bool(DECLARE_GPU_SPEC);
 TF_CALL_half(DECLARE_GPU_SPEC);
 TF_CALL_float(DECLARE_GPU_SPEC);
 TF_CALL_double(DECLARE_GPU_SPEC);
+TF_CALL_complex64(DECLARE_GPU_SPEC);
+TF_CALL_complex128(DECLARE_GPU_SPEC);
 #undef DECLARE_GPU_SPEC
 #undef DECLARE_GPU_SPEC_DIM
 }  // namespace functor
@@ -149,9 +145,15 @@ TF_CALL_double(DECLARE_GPU_SPEC);
                           ReverseOp<GPUDevice, T>)
 TF_CALL_uint8(REGISTER_GPU_KERNEL);
 TF_CALL_int8(REGISTER_GPU_KERNEL);
+// TODO Find out why the int32 GPU kernel doesn't work
+// and decide whether we want to enable the bool kernel.
+//TF_CALL_int32(REGISTER_GPU_KERNEL);
+//TF_CALL_bool(REGISTER_GPU_KERNEL);
 TF_CALL_half(REGISTER_GPU_KERNEL);
 TF_CALL_float(REGISTER_GPU_KERNEL);
 TF_CALL_double(REGISTER_GPU_KERNEL);
+TF_CALL_complex64(REGISTER_GPU_KERNEL);
+TF_CALL_complex128(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 #endif  // GOOGLE_CUDA


@@ -25,24 +25,30 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define DEFINE_REVERSE(DIM)                                        \
-  template struct functor::Reverse<GPUDevice, uint8, DIM>;         \
-  template struct functor::Reverse<GPUDevice, int8, DIM>;          \
-  template struct functor::Reverse<GPUDevice, int32, DIM>;         \
-  template struct functor::Reverse<GPUDevice, bool, DIM>;          \
-  template struct functor::Reverse<GPUDevice, Eigen::half, DIM>;   \
-  template struct functor::Reverse<GPUDevice, float, DIM>;         \
-  template struct functor::Reverse<GPUDevice, double, DIM>;
-DEFINE_REVERSE(0)
-DEFINE_REVERSE(1)
-DEFINE_REVERSE(2)
-DEFINE_REVERSE(3)
-DEFINE_REVERSE(4)
-DEFINE_REVERSE(5)
-DEFINE_REVERSE(6)
-DEFINE_REVERSE(7)
-DEFINE_REVERSE(8)
+#define DEFINE_REVERSE(T, DIM) \
+  template struct functor::Reverse<GPUDevice, T, DIM>;
+#define DEFINE_REVERSE_ALL_DIMS(T) \
+  DEFINE_REVERSE(T, 0)             \
+  DEFINE_REVERSE(T, 1)             \
+  DEFINE_REVERSE(T, 2)             \
+  DEFINE_REVERSE(T, 3)             \
+  DEFINE_REVERSE(T, 4)             \
+  DEFINE_REVERSE(T, 5)             \
+  DEFINE_REVERSE(T, 6)             \
+  DEFINE_REVERSE(T, 7)             \
+  DEFINE_REVERSE(T, 8)
+
+TF_CALL_uint8(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_int8(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_int32(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_bool(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_half(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_float(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_double(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_complex64(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_complex128(DEFINE_REVERSE_ALL_DIMS);
 #undef DEFINE_REVERSE
+#undef DEFINE_REVERSE_ALL_DIMS
 
 }  // namespace tensorflow

View File

@ -0,0 +1,177 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#define EIGEN_USE_THREADS
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#endif // GOOGLE_CUDA
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "third_party/eigen3/Eigen/Core"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/kernels/scan_ops.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
template <typename Device, class T, typename Reducer>
class ScanOp : public OpKernel {
public:
explicit ScanOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("reverse", &reverse_));
OP_REQUIRES_OK(ctx, ctx->GetAttr("exclusive", &exclusive_));
}
void Compute(OpKernelContext* ctx) override {
const Tensor& input = ctx->input(0);
const Tensor& tensor_axis = ctx->input(1);
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_axis.shape()),
errors::InvalidArgument("ScanOp: axis must be a scalar, not ",
tensor_axis.shape().DebugString()));
const int axis = internal::SubtleMustCopy(tensor_axis.scalar<int>()());
OP_REQUIRES(
ctx, FastBoundsCheck(axis, input.dims()),
errors::InvalidArgument("ScanOp: Expected scan axis in the range [", 0,
", ", input.dims(), "), but got ", axis));
TensorShape output_shape = input.shape();
Tensor* output = nullptr;
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output));
const Device& d = ctx->eigen_device<Device>();
Reducer reducer;
#define HANDLE_SCAN(NDIMS) \
case NDIMS: \
functor::Scan<Device, Reducer, T, NDIMS>()( \
d, input.tensor<T, NDIMS>(), output->tensor<T, NDIMS>(), reducer, \
axis, reverse_, exclusive_); \
return;
switch (input.dims()) {
// input.dims() == 0 can't occur as there
// is no valid axis parameter in this case
HANDLE_SCAN(1);
HANDLE_SCAN(2);
HANDLE_SCAN(3);
HANDLE_SCAN(4);
HANDLE_SCAN(5);
HANDLE_SCAN(6);
HANDLE_SCAN(7);
HANDLE_SCAN(8);
default:
OP_REQUIRES(ctx, false, errors::InvalidArgument(
"Scan does not support tensors with "
"more than 8 dimensions",
input.dims()));
}
#undef HANDLE_SCAN
}
private:
bool reverse_;
bool exclusive_;
};
#ifdef GOOGLE_CUDA
namespace functor {
// Forward declarations of GPU functors
#define DECLARE(REDUCER, T, D) \
template <> \
void Scan<GPUDevice, REDUCER, T, D>::operator()( \
const GPUDevice& d, TTypes<T, D>::ConstTensor in, \
TTypes<T, D>::Tensor out, const REDUCER& reducer, \
const Eigen::Index& axis, const bool reverse, const bool exclusive); \
extern template struct Scan<GPUDevice, REDUCER, T, D>;
#define DECLARE_FOR_ALL_DIMS(REDUCER, T) \
DECLARE(REDUCER, T, 1); \
DECLARE(REDUCER, T, 2); \
DECLARE(REDUCER, T, 3); \
DECLARE(REDUCER, T, 4); \
DECLARE(REDUCER, T, 5); \
DECLARE(REDUCER, T, 6); \
DECLARE(REDUCER, T, 7); \
DECLARE(REDUCER, T, 8);
#define DECLARE_FOR_ALL_REDUCERS(T) \
DECLARE_FOR_ALL_DIMS(Eigen::internal::SumReducer<T>, T); \
DECLARE_FOR_ALL_DIMS(Eigen::internal::ProdReducer<T>, T);
TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_ALL_REDUCERS);
#undef DECLARE_FOR_ALL_REDUCERS
#undef DECLARE_FOR_ALL_DIMS
#undef DECLARE
} // namespace functor
#endif // GOOGLE_CUDA
// Register Cumsum kernels
#define REGISTER_CPU_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
Name("Cumsum").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
ScanOp<CPUDevice, type, Eigen::internal::SumReducer<type>>)
TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#if GOOGLE_CUDA
#define REGISTER_GPU_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
Name("Cumsum") \
.Device(DEVICE_GPU) \
.TypeConstraint<type>("T") \
.HostMemory("axis"), \
ScanOp<GPUDevice, type, Eigen::internal::SumReducer<type>>)
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS)
#undef REGISTER_GPU_KERNELS
#endif // GOOGLE_CUDA
// Register Cumprod kernels
#define REGISTER_CPU_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
Name("Cumprod").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
ScanOp<CPUDevice, type, Eigen::internal::ProdReducer<type>>)
TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
#if GOOGLE_CUDA
#define REGISTER_GPU_KERNELS(type) \
REGISTER_KERNEL_BUILDER( \
Name("Cumprod") \
.Device(DEVICE_GPU) \
.TypeConstraint<type>("T") \
.HostMemory("axis"), \
ScanOp<GPUDevice, type, Eigen::internal::ProdReducer<type>>)
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS)
#undef REGISTER_GPU_KERNELS
#endif // GOOGLE_CUDA
} // namespace tensorflow


@ -0,0 +1,47 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_KERNELS_SCAN_OPS_H_
#define TENSORFLOW_KERNELS_SCAN_OPS_H_
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
namespace tensorflow {
namespace functor {
typedef Eigen::Index Index;
template <typename Device, typename Reducer, typename T, int Dims>
struct Scan {
void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor in,
typename TTypes<T, Dims>::Tensor out, const Reducer& reducer,
const Index& axis, const bool reverse, const bool exclusive) {
// Perform the reverse ops directly with Eigen, which avoids copying the
// tensor twice compared to using individual ops.
Eigen::array<bool, Dims> dims;
for (int i = 0; i < dims.size(); i++) {
dims[i] = reverse && (i == axis);
}
To32Bit(out).device(d) = To32Bit(in).reverse(dims)
.scan(axis, reducer, exclusive)
.reverse(dims);
}
};
} // namespace functor
} // namespace tensorflow
#endif // TENSORFLOW_KERNELS_SCAN_OPS_H_
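The flip-scan-flip pattern in the functor can be mirrored in NumPy (illustrative; np.cumsum stands in for the reducer-driven scan, and the exclusive mode that Eigen's scan() handles itself is omitted):

import numpy as np

def scan_reference(x, axis, reverse=False):
    """NumPy analogue of the Scan functor's reverse handling (illustrative).

    Eigen's scan() performs the (optionally exclusive) accumulation; the
    functor expresses reversal as flip -> scan -> flip so the tensor is not
    copied twice by separate reverse ops.
    """
    if reverse:
        x = np.flip(x, axis=axis)
    out = np.cumsum(x, axis=axis)  # stand-in for the reducer-driven scan
    if reverse:
        out = np.flip(out, axis=axis)
    return out

x = np.arange(6.0).reshape(2, 3)
print(scan_reference(x, axis=1, reverse=True))  # row-wise suffix sums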


@ -0,0 +1,54 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "tensorflow/core/framework/numeric_types.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/scan_ops.h"
namespace tensorflow {
typedef Eigen::GpuDevice GPUDevice;
typedef Eigen::Index Index;
#define DEFINE(REDUCER, T, D) \
template struct functor::Scan<GPUDevice, REDUCER, T, D>;
#define DEFINE_FOR_ALL_DIMS(REDUCER, T) \
DEFINE(REDUCER, T, 1); \
DEFINE(REDUCER, T, 2); \
DEFINE(REDUCER, T, 3); \
DEFINE(REDUCER, T, 4); \
DEFINE(REDUCER, T, 5); \
DEFINE(REDUCER, T, 6); \
DEFINE(REDUCER, T, 7); \
DEFINE(REDUCER, T, 8)
#define DEFINE_FOR_ALL_REDUCERS(T) \
DEFINE_FOR_ALL_DIMS(Eigen::internal::SumReducer<T>, T); \
DEFINE_FOR_ALL_DIMS(Eigen::internal::ProdReducer<T>, T);
TF_CALL_GPU_NUMBER_TYPES(DEFINE_FOR_ALL_REDUCERS);
#undef DEFINE_FOR_ALL_REDUCERS
#undef DEFINE_FOR_ALL_DIMS
#undef DEFINE
} // end namespace tensorflow
#endif // GOOGLE_CUDA


@@ -58,9 +58,8 @@ struct ApplyAdadelta<CPUDevice, T> {
                   typename TTypes<T>::ConstFlat grad) {
     accum.device(d) =
         accum * rho() + grad.square() * (static_cast<T>(1) - rho());
-    const auto update =
-        (accum_update + epsilon()).sqrt() *
-        (accum + epsilon()).rsqrt() * grad;
+    const auto update =
+        (accum_update + epsilon()).sqrt() * (accum + epsilon()).rsqrt() * grad;
     accum_update.device(d) =
         accum_update * rho() + update.square() * (static_cast<T>(1) - rho());
     var.device(d) -= update * lr();
@@ -176,9 +175,13 @@ struct ApplyMomentum<CPUDevice, T> {
                   typename TTypes<T>::Flat accum,
                   typename TTypes<T>::ConstScalar lr,
                   typename TTypes<T>::ConstFlat grad,
-                  typename TTypes<T>::ConstScalar momentum) {
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov) {
     accum.device(d) = accum * momentum() + grad;
-    var.device(d) -= accum * lr();
+    if (use_nesterov) {
+      var.device(d) -= grad * lr() + accum * momentum() * lr();
+    } else {
+      var.device(d) -= accum * lr();
+    }
   }
 };
 
@@ -1515,6 +1518,7 @@ class ApplyMomentumOp : public OpKernel {
  public:
   explicit ApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
   }
 
   void Compute(OpKernelContext* ctx) override {
@@ -1554,12 +1558,13 @@ class ApplyMomentumOp : public OpKernel {
     const Device& device = ctx->template eigen_device<Device>();
     functor::ApplyMomentum<Device, T>()(device, var.flat<T>(), accum.flat<T>(),
                                         lr.scalar<T>(), grad.flat<T>(),
-                                        momentum.scalar<T>());
+                                        momentum.scalar<T>(), use_nesterov_);
     ctx->forward_ref_input_to_ref_output(0, 0);
   }
 
  private:
   bool use_exclusive_lock_;
+  bool use_nesterov_;
 };
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -1584,7 +1589,7 @@ namespace functor {
       const GPUDevice& d, typename TTypes<T>::Flat var,                   \
       typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \
       typename TTypes<T>::ConstFlat grad,                                 \
-      typename TTypes<T>::ConstScalar momentum);                          \
+      typename TTypes<T>::ConstScalar momentum, bool use_nesterov);       \
   extern template struct ApplyMomentum<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@@ -1605,6 +1610,7 @@ class SparseApplyMomentumOp : public OpKernel {
  public:
   explicit SparseApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
@@ -1672,7 +1678,12 @@ class SparseApplyMomentumOp : public OpKernel {
         auto g = grad_flat.template chip<0>(i);
         auto v = var_flat.template chip<0>(index);
         a = a * a.constant(momentum_scalar) + g;
-        v -= a.constant(lr_scalar) * a;
+        if (use_nesterov_) {
+          v -= g.constant(lr_scalar) * g +
+               a.constant(lr_scalar) * a.constant(momentum_scalar) * a;
+        } else {
+          v -= a.constant(lr_scalar) * a;
+        }
       }
     }
 
@@ -1681,6 +1692,7 @@ class SparseApplyMomentumOp : public OpKernel {
 
  private:
   bool use_exclusive_lock_;
+  bool use_nesterov_;
 };
 
 #define REGISTER_KERNELS(T, Tindices) \
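In update-rule form, the two branches of the new use_nesterov path compute the following (a NumPy sketch of the CPU functor above; names are mine):

import numpy as np

def momentum_update(var, accum, grad, lr, momentum, use_nesterov=False):
    """NumPy sketch of ApplyMomentum's dense update (names are mine)."""
    accum = momentum * accum + grad
    if use_nesterov:
        # Nesterov variant: step along the gradient plus the look-ahead term.
        var = var - (lr * grad + lr * momentum * accum)
    else:
        var = var - lr * accum
    return var, accum

v, a = np.zeros(3), np.zeros(3)
g = np.array([1.0, 2.0, 3.0])
v, a = momentum_update(v, a, g, lr=0.1, momentum=0.9, use_nesterov=True)
print(v)  # -> -(0.1 + 0.1 * 0.9) * g = -0.19 * g on the first step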


@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_KERNELS_TRAINING_OPS_H_
 #define TENSORFLOW_KERNELS_TRAINING_OPS_H_
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace functor {
@@ -98,7 +98,7 @@ struct ApplyMomentum {
                   typename TTypes<T>::Flat accum,
                   typename TTypes<T>::ConstScalar lr,
                   typename TTypes<T>::ConstFlat grad,
-                  typename TTypes<T>::ConstScalar momentum);
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov);
 };
 
 template <typename Device, typename T>


@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/training_ops.h"
+#include "tensorflow/core/framework/register_types.h"
 
 namespace tensorflow {
 
@@ -84,12 +84,18 @@ struct ApplyMomentum<GPUDevice, T> {
                   typename TTypes<T>::Flat accum,
                   typename TTypes<T>::ConstScalar lr,
                   typename TTypes<T>::ConstFlat grad,
-                  typename TTypes<T>::ConstScalar momentum) {
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov) {
     Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
     bcast[0] = grad.dimension(0);
     Eigen::Sizes<1> single;
     accum.device(d) = accum * momentum.reshape(single).broadcast(bcast) + grad;
-    var.device(d) -= lr.reshape(single).broadcast(bcast) * accum;
+    if (use_nesterov) {
+      var.device(d) -= grad * lr.reshape(single).broadcast(bcast) +
+                       accum * momentum.reshape(single).broadcast(bcast) *
+                           lr.reshape(single).broadcast(bcast);
+    } else {
+      var.device(d) -= lr.reshape(single).broadcast(bcast) * accum;
+    }
   }
 };


@ -0,0 +1,95 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Functions to read images in GIF format.
#include "tensorflow/core/lib/gif/gif_io.h"
#include "tensorflow/core/platform/gif.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace gif {
int input_callback(GifFileType* gif_file, GifByteType* buf, int size) {
if (gif_file->UserData && memcpy(buf, gif_file->UserData, size)) {
gif_file->UserData = ((uint8_t*)gif_file->UserData) + size;
return size;
}
return 0;
}
uint8* Decode(const void* srcdata, int datasize,
std::function<uint8*(int, int, int, int)> allocate_output) {
int error_code = D_GIF_SUCCEEDED;
GifFileType* gif_file =
DGifOpen(const_cast<void*>(srcdata), &input_callback, &error_code);
if (error_code != D_GIF_SUCCEEDED) {
LOG(ERROR) << "Fail to open gif file, reason: "
<< GifErrorString(error_code);
return nullptr;
}
if (DGifSlurp(gif_file) != GIF_OK) {
LOG(ERROR) << "Fail to slurp gif file, reason: "
<< GifErrorString(gif_file->Error);
return nullptr;
}
if (gif_file->ImageCount <= 0) {
LOG(ERROR) << "Gif file does not contain any image";
return nullptr;
}
int num_frames = gif_file->ImageCount;
int width = gif_file->SWidth;
int height = gif_file->SHeight;
int channel = 3;
uint8* dstdata = allocate_output(num_frames, width, height, channel);
for (int k = 0; k < num_frames; k++) {
SavedImage* this_image = &gif_file->SavedImages[k];
GifImageDesc* img_desc = &this_image->ImageDesc;
if (img_desc->Left != 0 || img_desc->Top != 0 || img_desc->Width != width ||
img_desc->Height != height) {
LOG(ERROR) << "Can't process optimized gif.";
return nullptr;
}
ColorMapObject* color_map = this_image->ImageDesc.ColorMap
? this_image->ImageDesc.ColorMap
: gif_file->SColorMap;
uint8* this_dst = dstdata + k * width * channel * height;
for (int i = 0; i < height; ++i) {
uint8* p_dst = this_dst + i * width * channel;
for (int j = 0; j < width; ++j) {
GifByteType color_index = this_image->RasterBits[i * width + j];
const GifColorType& gif_color = color_map->Colors[color_index];
p_dst[j * channel + 0] = gif_color.Red;
p_dst[j * channel + 1] = gif_color.Green;
p_dst[j * channel + 2] = gif_color.Blue;
}
}
}
if (DGifCloseFile(gif_file, &error_code) != GIF_OK) {
LOG(WARNING) << "Fail to close gif file, reason: "
<< GifErrorString(error_code);
}
return dstdata;
}
} // namespace gif
} // namespace tensorflow
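The per-frame loop expands palette indices into interleaved RGB triples; an illustrative pure-Python analogue (names are mine):

def expand_palette(raster_bits, color_map, width, height):
    """Pure-Python analogue of the per-frame loop in gif::Decode (sketch).

    raster_bits holds width*height palette indices; color_map maps an index
    to an (r, g, b) triple. Returns the frame as interleaved RGB bytes.
    """
    frame = bytearray(width * height * 3)
    for i in range(height):
        for j in range(width):
            r, g, b = color_map[raster_bits[i * width + j]]
            p = (i * width + j) * 3
            frame[p:p + 3] = bytes((r, g, b))
    return bytes(frame)

# e.g. a 2x1 frame drawn from a two-entry palette:
print(expand_palette(b"\x00\x01", [(255, 0, 0), (0, 0, 255)], 2, 1))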


@ -0,0 +1,51 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Functions to read and write images in GIF format.
//
// The advantage over image/codec/png{enc,dec}oder.h is that this library
// supports both 8 and 16 bit images.
//
// The decoding routine accepts binary image data as a StringPiece. These are
// implicitly constructed from strings or char* so they're completely
// transparent to the caller. They're also very cheap to construct so this
// doesn't introduce any additional overhead.
//
// The primary benefit of StringPieces being, in this case, that APIs already
// returning StringPieces (e.g., Bigtable Scanner) or Cords (e.g., IOBuffer;
// only when they're flat, though) or protocol buffer fields typed to either of
// these can be decoded without copying the data into a C++ string.
#ifndef TENSORFLOW_CORE_LIB_GIF_GIF_IO_H_
#define TENSORFLOW_CORE_LIB_GIF_GIF_IO_H_
#include <functional>
#include <string>
#include <utility>
#include <vector>
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
namespace gif {
uint8* Decode(const void* srcdata, int datasize,
std::function<uint8*(int, int, int, int)> allocate_output);
} // namespace gif
} // namespace tensorflow
#endif // TENSORFLOW_CORE_LIB_GIF_GIF_IO_H_

(Two binary GIF test images added, 2.0 KiB and 2.3 KiB: lib/gif/testdata/scan.gif and lib/gif/testdata/optimized.gif, per the BUILD entries above.)


@ -739,7 +739,7 @@ REGISTER_OP("Reverse")
.Input("tensor: T") .Input("tensor: T")
.Input("dims: bool") .Input("dims: bool")
.Output("output: T") .Output("output: T")
.Attr("T: {uint8, int8, int32, bool, half, float, double}") .Attr("T: {uint8, int8, int32, bool, half, float, double, complex64, complex128}")
.SetShapeFn([](InferenceContext* c) { .SetShapeFn([](InferenceContext* c) {
const Shape* input = c->input(0); const Shape* input = c->input(0);
const Shape* dims; const Shape* dims;


@@ -440,10 +440,27 @@ compression: Compression level.
 contents: 0-D. PNG-encoded image.
 )doc");
 
+// --------------------------------------------------------------------------
+REGISTER_OP("DecodeGif")
+    .Input("contents: string")
+    .Output("image: uint8")
+    .Doc(R"doc(
+Decode the first frame of a GIF-encoded image to a uint8 tensor.
+
+GIFs with frame or transparency compression are not supported; convert
+animated GIFs from compressed to uncompressed with:
+
+    convert $src.gif -coalesce $dst.gif
+
+contents: 0-D. The GIF-encoded image.
+image: 4-D with shape `[num_frames, height, width, 3]`. RGB order.
+)doc");
+
 // --------------------------------------------------------------------------
 REGISTER_OP("RGBToHSV")
-    .Input("images: float")
-    .Output("output: float")
+    .Input("images: T")
+    .Output("output: T")
+    .Attr("T: {float, double} = DT_FLOAT")
     .SetShapeFn(ColorspaceShapeFn)
     .Doc(R"doc(
 Converts one or more images from RGB to HSV.
@@ -462,8 +479,9 @@ output: `images` converted to HSV.
 // --------------------------------------------------------------------------
 REGISTER_OP("HSVToRGB")
-    .Input("images: float")
-    .Output("output: float")
+    .Input("images: T")
+    .Output("output: T")
+    .Attr("T: {float, double} = DT_FLOAT")
     .SetShapeFn(ColorspaceShapeFn)
     .Doc(R"doc(
 Convert one or more images from HSV to RGB.
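With the new T attr, the Python wrappers should accept double-precision images as well; a sketch, assuming the generated ops are exposed as tf.image.rgb_to_hsv and tf.image.hsv_to_rgb:

import numpy as np
import tensorflow as tf

images = np.random.rand(2, 4, 4, 3)             # float64 batch of RGB images
hsv = tf.image.rgb_to_hsv(tf.constant(images))  # dtype carried through via T
rgb = tf.image.hsv_to_rgb(hsv)
with tf.Session() as sess:
    out = sess.run(rgb)
print(np.allclose(out, images, atol=1e-6))      # round trip holds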


@ -1831,4 +1831,76 @@ b: Another tensor, of same type and shape as `a`.
product: Pairwise cross product of the vectors in `a` and `b`. product: Pairwise cross product of the vectors in `a` and `b`.
)doc"); )doc");
// --------------------------------------------------------------------------
REGISTER_OP("Cumsum")
.Input("x: T")
.Input("axis: int32")
.Attr("exclusive: bool = false")
.Attr("reverse: bool = false")
.Output("out: T")
.Attr("T: numbertype")
.Doc(R"doc(
Compute the cumulative sum of the tensor `x` along `axis`.
By default, this op performs an inclusive cumsum, which means that the first
element of the input is identical to the first element of the output:
```prettyprint
tf.cumsum([a, b, c]) ==> [a, a + b, a + b + c]
```
By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
performed instead:
```prettyprint
tf.cumsum([a, b, c], exclusive=True) ==> [0, a, a + b]
```
By setting the `reverse` kwarg to `True`, the cumsum is performed in the
opposite direction:
```prettyprint
tf.cumsum([a, b, c], reverse=True) ==> [a + b + c, b + c, c]
```
This is more efficient than using separate `tf.reverse` ops.
The `reverse` and `exclusive` kwargs can also be combined:
```prettyprint
tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0]
```
)doc");
REGISTER_OP("Cumprod")
.Input("x: T")
.Input("axis: int32")
.Attr("exclusive: bool = false")
.Attr("reverse: bool = false")
.Output("out: T")
.Attr("T: numbertype")
.Doc(R"doc(
Compute the cumulative product of the tensor `x` along `axis`.
By default, this op performs an inclusive cumprod, which means that the first
element of the input is identical to the first element of the output:
```prettyprint
tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c]
```
By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
performed instead:
```prettyprint
tf.cumprod([a, b, c], exclusive=True) ==> [1, a, a * b]
```
By setting the `reverse` kwarg to `True`, the cumprod is performed in the
opposite direction:
```prettyprint
tf.cumprod([a, b, c], reverse=True) ==> [a * b * c, b * c, c]
```
This is more efficient than using separate `tf.reverse` ops.
The `reverse` and `exclusive` kwargs can also be combined:
```prettyprint
tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 0]
```
)doc");
}  // namespace tensorflow

View File

@ -4342,6 +4342,42 @@ op {
  summary: "Decode a PNG-encoded image to a uint8 or uint16 tensor."
  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n* 0: Use the number of channels in the PNG-encoded image.\n* 1: output a grayscale image.\n* 3: output an RGB image.\n* 4: output an RGBA image.\n\nIf needed, the PNG-encoded image is transformed to match the requested number\nof color channels."
}
op {
  name: "DecodeGif"
  input_arg {
    name: "contents"
    description: "0-D.  The GIF-encoded image."
    type: DT_STRING
  }
  output_arg {
    name: "image"
    description: "4-D with shape `[num_frames, height, width, 3]`.  RGB order."
    type: DT_UINT8
  }
  summary: "Decode the frame(s) of a GIF-encoded image to a uint8 tensor."
  description: "GIFs with frame or transparency compression are not supported; convert animated GIFs from compressed to uncompressed with:\n\nconvert $src.gif -coalesce $dst.gif"
}
op {
  name: "DecodeRaw"
  input_arg {

View File

@ -488,11 +488,13 @@ REGISTER_OP("ApplyMomentum")
    .Output("out: Ref(T)")
    .Attr("T: numbertype")
    .Attr("use_locking: bool = false")
    .Attr("use_nesterov: bool = false")
    .SetShapeFn([](InferenceContext* c) {
      return ApplyMomentumShapeFn(c, false /* sparse */);
    })
    .Doc(R"doc(
Update '*var' according to the momentum scheme.
Update '*var' according to the momentum scheme. Set use_nesterov = True if you
want to use Nesterov momentum.

accum = accum * momentum + grad
var -= lr * accum
@ -506,6 +508,9 @@ out: Same as "var".
use_locking: If `True`, updating of the var and accum tensors will be protected
  by a lock; otherwise the behavior is undefined, but may exhibit less
  contention.
use_nesterov: If `True`, the tensor passed to compute grad will be
var - lr * momentum * accum, so in the end, the var you get is actually
var - lr * momentum * accum.
)doc"); )doc");
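For intuition, here is a NumPy sketch of one reading of the update rule documented above (names are illustrative and the kernel implementation is authoritative; the Nesterov branch steps with the gradient plus the momentum look-ahead term):

```python
import numpy as np

def apply_momentum(var, accum, grad, lr, momentum, use_nesterov=False):
    # accum = accum * momentum + grad
    accum = momentum * accum + grad
    if use_nesterov:
        # Nesterov variant: step with the gradient plus the look-ahead term,
        # i.e. the step is evaluated as if taken at var - lr * momentum * accum.
        var = var - lr * (grad + momentum * accum)
    else:
        # Classic momentum: var -= lr * accum
        var = var - lr * accum
    return var, accum
```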
REGISTER_OP("SparseApplyMomentum")
@ -519,11 +524,13 @@ REGISTER_OP("SparseApplyMomentum")
    .Attr("T: numbertype")
    .Attr("Tindices: {int32, int64}")
    .Attr("use_locking: bool = false")
    .Attr("use_nesterov: bool = false")
    .SetShapeFn([](InferenceContext* c) {
      return ApplyMomentumShapeFn(c, true /* sparse */);
    })
    .Doc(R"doc(
Update relevant entries in '*var' and '*accum' according to the momentum scheme.
Set use_nesterov = True if you want to use Nesterov momentum.

That is for rows we have grad for, we update var and accum as follows:
@ -540,6 +547,9 @@ out: Same as "var".
use_locking: If `True`, updating of the var and accum tensors will be protected
  by a lock; otherwise the behavior is undefined, but may exhibit less
  contention.
use_nesterov: If `True`, the tensor passed to compute grad will be
var - lr * momentum * accum, so in the end, the var you get is actually
var - lr * momentum * accum.
)doc"); )doc");
static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) {

View File

@ -57,12 +57,13 @@ cc_library(
    name = "platformlib",
    copts = tf_copts(),
    deps = [
        "//tensorflow/core:protos_cc",
        "@farmhash_archive//:farmhash",
        "@gif_archive//:gif",
        "@highwayhash//:sip_hash",
        "@jpeg_archive//:jpeg",
        "@png_archive//:png",
        "@highwayhash//:sip_hash",
        "@re2//:re2",
        "//tensorflow/core:protos_cc",
    ],
)

View File

@ -0,0 +1,29 @@
/* Copyright 2015 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_PLATFORM_GIF_H_
#define TENSORFLOW_CORE_PLATFORM_GIF_H_
#include "tensorflow/core/platform/platform.h"
#if defined(PLATFORM_GOOGLE)
#include "tensorflow/core/platform/google/build_config/gif.h"
#elif defined(PLATFORM_POSIX) && !defined(IS_MOBILE_PLATFORM)
#include "giflib-5.1.4/lib/gif_lib.h"
#else
#error Define the appropriate PLATFORM_<foo> macro for this platform
#endif
#endif // TENSORFLOW_CORE_PLATFORM_GIF_H_

View File

@ -94,8 +94,8 @@ def run_training():
    saver = tf.train.Saver()

    # Create the op for initializing variables.
    init_op = tf.initialize_all_variables()
    init_op = tf.group(tf.initialize_all_variables(),
                       tf.initialize_local_variables())

    # Create a session for running Ops on the Graph.
    sess = tf.Session()

View File

@ -99,8 +99,10 @@ Status ReadTensorFromImageFile(string file_name, const int input_height,
  if (tensorflow::StringPiece(file_name).ends_with(".png")) {
    image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                             DecodePng::Channels(wanted_channels));
  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
    image_reader = DecodeGif(root.WithOpName("gif_reader"), file_reader);
  } else {
    // Assume if it's not a PNG then it must be a JPEG.
    // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
    image_reader = DecodeJpeg(root.WithOpName("jpeg_reader"), file_reader,
                              DecodeJpeg::Channels(wanted_channels));
  }

tensorflow/examples/skflow/resnet.py (Normal file → Executable file)
View File

@ -52,13 +52,13 @@ def res_net(x, y, activation=tf.nn.relu):
    Predictions and loss tensors.
  """

  # Configurations for each bottleneck block.
  # Configurations for each bottleneck group.
  BottleneckBlock = namedtuple(
      'BottleneckBlock', ['num_layers', 'num_filters', 'bottleneck_size'])
  blocks = [BottleneckBlock(3, 128, 32),
            BottleneckBlock(3, 256, 64),
            BottleneckBlock(3, 512, 128),
            BottleneckBlock(3, 1024, 256)]
  BottleneckGroup = namedtuple(
      'BottleneckGroup', ['num_blocks', 'num_filters', 'bottleneck_size'])
  groups = [BottleneckGroup(3, 128, 32),
            BottleneckGroup(3, 256, 64),
            BottleneckGroup(3, 512, 128),
            BottleneckGroup(3, 1024, 256)]

  input_shape = x.get_shape().as_list()
@ -78,19 +78,19 @@ def res_net(x, y, activation=tf.nn.relu):
  # First chain of resnets
  with tf.variable_scope('conv_layer2'):
    net = learn.ops.conv2d(net, blocks[0].num_filters,
    net = learn.ops.conv2d(net, groups[0].num_filters,
                           [1, 1], [1, 1, 1, 1],
                           padding='VALID', bias=True)

  # Create each bottleneck building block for each layer
  for block_i, block in enumerate(blocks):
    for layer_i in range(block.num_layers):
      name = 'block_%d/layer_%d' % (block_i, layer_i)
  # Create the bottleneck groups, each of which contains `num_blocks`
  # bottleneck blocks.
  for group_i, group in enumerate(groups):
    for block_i in range(group.num_blocks):
      name = 'group_%d/block_%d' % (group_i, block_i)

      # 1x1 convolution responsible for reducing dimension
      with tf.variable_scope(name + '/conv_in'):
        conv = learn.ops.conv2d(net, block.bottleneck_size,
        conv = learn.ops.conv2d(net, group.bottleneck_size,
                                [1, 1], [1, 1, 1, 1],
                                padding='VALID',
                                activation=activation,
@ -98,7 +98,7 @@ def res_net(x, y, activation=tf.nn.relu):
                                bias=False)

      with tf.variable_scope(name + '/conv_bottleneck'):
        conv = learn.ops.conv2d(conv, block.bottleneck_size,
        conv = learn.ops.conv2d(conv, group.bottleneck_size,
                                [3, 3], [1, 1, 1, 1],
                                padding='SAME',
                                activation=activation,
@ -107,7 +107,8 @@ def res_net(x, y, activation=tf.nn.relu):
      # 1x1 convolution responsible for restoring dimension
      with tf.variable_scope(name + '/conv_out'):
        conv = learn.ops.conv2d(conv, block.num_filters,
        input_dim = net.get_shape()[-1].value
        conv = learn.ops.conv2d(conv, input_dim,
                                [1, 1], [1, 1, 1, 1],
                                padding='VALID',
                                activation=activation,
@ -118,16 +119,16 @@ def res_net(x, y, activation=tf.nn.relu):
      # residual function (identity shortcut)
      net = conv + net

    try:
      # upscale to the next block size
      next_block = blocks[block_i + 1]
      with tf.variable_scope('block_%d/conv_upscale' % block_i):
        net = learn.ops.conv2d(net, next_block.num_filters,
      # upscale to the next group size
      next_group = groups[group_i + 1]
      with tf.variable_scope('block_%d/conv_upscale' % group_i):
        net = learn.ops.conv2d(net, next_group.num_filters,
                               [1, 1], [1, 1, 1, 1],
                               bias=False,
                               padding='SAME')
    except IndexError:
      pass

  net_shape = net.get_shape().as_list()
  net = tf.nn.avg_pool(net,
@ -139,18 +140,12 @@ def res_net(x, y, activation=tf.nn.relu):
  return learn.models.logistic_regression(net, y)

# Download and load MNIST data.
mnist = input_data.read_data_sets('MNIST_data')

# Restore model if graph is saved into a folder.
if os.path.exists('models/resnet/graph.pbtxt'):
  classifier = learn.TensorFlowEstimator.restore('models/resnet/')
else:
  # Create a new resnet classifier.
  classifier = learn.TensorFlowEstimator(
      model_fn=res_net, n_classes=10, batch_size=100, steps=100,
      learning_rate=0.001, continue_training=True)

while True:
  # Train model and save summaries into logdir.
@ -161,6 +156,3 @@ while True:
  score = metrics.accuracy_score(
      mnist.test.labels, classifier.predict(mnist.test.images, batch_size=64))
  print('Accuracy: {0:f}'.format(score))
  # Save model graph and checkpoints.
  classifier.save('models/resnet/')

View File

@ -49,7 +49,7 @@ def train():
  # Create a multilayer model.

  # Input placehoolders
  # Input placeholders
  with tf.name_scope('input'):
    x = tf.placeholder(tf.float32, [None, 784], name='x-input')
    y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')

View File

@ -6,7 +6,11 @@ Course information can be found at https://www.udacity.com/course/deep-learning-
Running the Docker container from the Google Cloud repository
-------------------------------------------------------------

    docker run -p 8888:8888 -it b.gcr.io/tensorflow-udacity/assignments:0.5.0
    docker run -p 8888:8888 --name tensorflow-udacity -it b.gcr.io/tensorflow-udacity/assignments:0.5.0

Note that if you ever exit the container, you can return to it using:

    docker start -ai tensorflow-udacity

Accessing the Notebooks
-----------------------
@ -19,21 +23,6 @@ On mac, find the virtual machine's IP using:

Then go to: http://IP:8888 (likely http://192.168.99.100:8888)

Saving Your Progress
--------------------

Because of the `--rm` flag above, stopping the docker container removes it, so any changes you've made will disappear. One way around this is to remove the `--rm` flag, and name the container for easy restarting:

```sh
# you only need to "run" the container the first time:
docker run -p 8888:8888 -it --name tensorflow-udacity b.gcr.io/tensorflow-udacity/assignments:0.5.0

# …do various things…

# when you're done, control-C to kill jupyter and stop the container

# when you're ready to do more things, you can now just "start" the container:
docker start -ai tensorflow-udacity

# …do more things…
# …repeat…
```

FAQ
---

View File

@ -44,7 +44,7 @@ management system used to install and manage software packages written in
Python.

The packages that will be installed or upgraded during the pip install are listed in the
[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py)
[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py).

Install pip (or pip3 for python3) if it is not already installed:
@ -231,7 +231,7 @@ packages needed by TensorFlow.

Install Anaconda:

Follow the instructions on the [Anaconda download site](https://www.continuum.io/downloads)
Follow the instructions on the [Anaconda download site](https://www.continuum.io/downloads).

Create a conda environment called `tensorflow`:
@ -377,6 +377,8 @@ The option `-p 8888:8888` is used to publish the Docker container's internal port

The format of the port mapping is `hostPort:containerPort`. You can specify any valid port number for the host port but have to use `8888` for the container port portion.

If you're using a container with GPU support, some additional flags must be passed to expose the GPU device to the container.

For NVidia GPU support install latest NVidia drivers and
[nvidia-docker](https://github.com/NVIDIA/nvidia-docker).
Run with
@ -385,7 +387,15 @@ Run with
$ nvidia-docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow:latest-gpu
```

For more details see (TensorFlow docker readme)[https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/docker].
If you have a problem running `nvidia-docker` with the default config, we include a
[script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/docker_run_gpu.sh)
in the repo with these flags, so the command line would look like

```bash
$ path/to/repo/tensorflow/tools/docker/docker_run_gpu.sh -p 8888:8888 gcr.io/tensorflow/tensorflow:latest-gpu
```

For more details see [TensorFlow docker readme](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/docker).

You can now [test your installation](#test-the-tensorflow-installation) within the Docker container.
@ -479,7 +489,7 @@ of tensorflow. If you want to install a specific branch (such as a release branch),
pass `-b <branchname>` to the `git clone` command and `--recurse-submodules` for
r0.8 and earlier to fetch the protobuf library that TensorFlow depends on.

### Installation for Linux
### Prepare environment for Linux

#### Install Bazel
@ -508,19 +518,6 @@ $ sudo apt-get install python-numpy swig python-dev python-wheel
$ sudo apt-get install python3-numpy swig python3-dev python3-wheel
```
#### Configure the installation
Run the `configure` script at the root of the tree. The configure script
asks you for the path to your python interpreter and allows (optional)
configuration of the CUDA libraries (see [below](#configure-tensorflows-canonical-view-of-cuda-libraries)).
This step is used to locate the python and numpy header files.
```bash
$ ./configure
Please specify the location of python. [Default is /usr/bin/python]:
```
#### Optional: Install CUDA (GPUs on Linux)

In order to build or run TensorFlow with GPU support, both NVIDIA's Cuda Toolkit (>= 7.0) and
@ -564,83 +561,7 @@ sudo cp cuda/lib64/libcudnn* /usr/local/cuda/lib64
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
```

##### Configure TensorFlow's canonical view of Cuda libraries
### Prepare environment for Mac OS X
When running the `configure` script from the root of your source tree, select
the option `Y` when asked to build TensorFlow with GPU support. If you have
several versions of Cuda or cuDNN installed, you should definitely select
one explicitly instead of relying on the system default. You should see
prompts like the following:
``` bash
$ ./configure
Please specify the location of python. [Default is /usr/bin/python]:
Do you wish to build TensorFlow with GPU support? [y/N] y
GPU support will be enabled for TensorFlow
Please specify which gcc nvcc should use as the host compiler. [Default is
/usr/bin/gcc]: /usr/bin/gcc-4.9
Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave
empty to use system default]: 7.5
Please specify the location where CUDA 7.5 toolkit is installed. Refer to
README.md for more details. [default is: /usr/local/cuda]: /usr/local/cuda
Please specify the cuDNN version you want to use. [Leave empty to use system
default]: 4.0.4
Please specify the location where the cuDNN 4.0.4 library is installed. Refer to
README.md for more details. [default is: /usr/local/cuda]: /usr/local/cudnn-r4-rc/
Please specify a list of comma-separated Cuda compute capabilities you want to
build with. You can find the compute capability of your device at:
https://developer.nvidia.com/cuda-gpus.
Please note that each additional compute capability significantly increases your
build time and binary size. [Default is: \"3.5,5.2\"]: 3.5
Setting up Cuda include
Setting up Cuda lib64
Setting up Cuda bin
Setting up Cuda nvvm
Setting up CUPTI include
Setting up CUPTI lib64
Configuration finished
```
This creates a canonical set of symbolic links to the Cuda libraries on your system.
Every time you change the Cuda library paths you need to run this step again before
you invoke the bazel build command. For the cuDNN libraries, use '6.5' for R2, '7.0'
for R3, and '4.0.4' for R4-RC.
##### Build your target with GPU support
From the root of your source tree, run:
```bash
$ bazel build -c opt --config=cuda //tensorflow/cc:tutorials_example_trainer
$ bazel-bin/tensorflow/cc/tutorials_example_trainer --use_gpu
# Lots of output. This tutorial iteratively calculates the major eigenvalue of
# a 2x2 matrix, on GPU. The last few lines look like this.
000009/000005 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
000006/000001 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
000009/000009 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
```
Note that "--config=cuda" is needed to enable the GPU support.
##### Known issues
* Although it is possible to build both Cuda and non-Cuda configs under the same
source tree, we recommend to run `bazel clean` when switching between these two
configs in the same source tree.
* You have to run configure before running bazel build. Otherwise, the build
will fail with a clear error message. In the future, we might consider making
this more convenient by including the configure step in our build process.
### Installation for Mac OS X
We recommend using [homebrew](http://brew.sh) to install the bazel and SWIG
dependencies, and installing python dependencies using easy_install or pip.
@ -713,15 +634,20 @@ $ sudo mv lib/libcudnn* /Developer/NVIDIA/CUDA-7.5/lib
$ sudo ln -s /Developer/NVIDIA/CUDA-7.5/lib/libcudnn* /usr/local/cuda/lib/
```

#### Configure the installation
### Configure the installation

Run the `configure` script at the root of the tree. The configure script
asks you for the path to your python interpreter.
asks you for the path to your python interpreter and allows (optional)
configuration of the CUDA libraries.

This step is used to locate the python and numpy header files as well as
enabling GPU support if you have a CUDA enabled GPU and Toolkit installed. For
example:
enabling GPU support if you have a CUDA enabled GPU and Toolkit installed.
Select the option `Y` when asked to build TensorFlow with GPU support.
If you have several versions of Cuda or cuDNN installed, you should definitely
select one explicitly instead of relying on the system default.

For example:

```bash
$ ./configure
@ -748,6 +674,38 @@ Setting up CUPTI lib64
Configuration finished
```
This creates a canonical set of symbolic links to the Cuda libraries on your system.
Every time you change the Cuda library paths you need to run this step again before
you invoke the bazel build command. For the cuDNN libraries, use '6.5' for R2, '7.0'
for R3, and '4.0.4' for R4-RC.
#### Build your target with GPU support
From the root of your source tree, run:
```bash
$ bazel build -c opt --config=cuda //tensorflow/cc:tutorials_example_trainer
$ bazel-bin/tensorflow/cc/tutorials_example_trainer --use_gpu
# Lots of output. This tutorial iteratively calculates the major eigenvalue of
# a 2x2 matrix, on GPU. The last few lines look like this.
000009/000005 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
000006/000001 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
000009/000009 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
```
Note that "--config=cuda" is needed to enable the GPU support.
#### Known issues
* Although it is possible to build both Cuda and non-Cuda configs under the same
source tree, we recommend running `bazel clean` when switching between these two
configs in the same source tree.
* You have to run configure before running bazel build. Otherwise, the build
will fail with a clear error message. In the future, we might consider making
this more convenient by including the configure step in our build process.
### Create the pip package and install

When building from source, you will still build a pip package and install that.

View File

@ -131,7 +131,7 @@ Once TensorBoard is running, navigate your web browser to `localhost:6006` to vi
The script will log TensorBoard summaries to `/tmp/retrain_logs` by default. You can change the directory with the `--summaries_dir` flag.

The [TensorBoard README](../../../tensorboard/README.md) has a lot more information on TensorBoard usage, including tips & tricks, and debugging information.
The [TensorBoard README](https://www.tensorflow.org/code/tensorflow/tensorboard/README.md) has a lot more information on TensorBoard usage, including tips & tricks, and debugging information.

## Using the Retrained Model

View File

@ -8,7 +8,7 @@ your TensorFlow graph, plot quantitative metrics about the execution of your
graph, and show additional data like images that pass through it. When
TensorBoard is fully configured, it looks like this:

[![MNIST TensorBoard](../../images/mnist_tensorboard.png "MNIST TensorBoard")](http://tensorflow.org/tensorboard)
[*Click to try a TensorBoard with data from this tutorial!*](http://tensorflow.org/tensorboard)

This tutorial is intended to get you started with simple TensorBoard usage.

View File

@ -37,6 +37,7 @@ The TensorFlow community has created many great projects around TensorFlow, incl
* [TensorFlow tutorials](https://github.com/pkmital/tensorflow_tutorials)
* [Scikit Flow - Simplified Interface for TensorFlow](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/learn/python/learn)
* [Caffe to TensorFlow model converter](https://github.com/ethereon/caffe-tensorflow)
* [Bitfusion's GPU-enabled AWS EC2 TensorFlow AMI](https://github.com/bitfusionio/amis/tree/master/awsmrkt-bfboost-ubuntu14-cuda75-tensorflow) ([Launch AMI](https://aws.amazon.com/marketplace/pp/B01EYKBEQ0))

### Development

View File

@ -190,11 +190,11 @@ accomplished by repeatedly running `train_step`.
```python
for i in range(1000):
  batch = mnist.train.next_batch(50)
  batch = mnist.train.next_batch(100)
  train_step.run(feed_dict={x: batch[0], y_: batch[1]})
```

Each training iteration we load 50 training examples. We then run the
Each training iteration we load 100 training examples. We then run the
`train_step` operation, using `feed_dict` to replace the `placeholder` tensors
`x` and `y_` with the training examples.

Note that you can replace any tensor in your computation graph using `feed_dict`

View File

@ -178,6 +178,7 @@ https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/get_starte
[bazel](https://github.com/bazelbuild/bazel)).

Next:

```bash
cd tensorflow/models/rnn/ptb
python ptb_word_lm.py --data_path=/tmp/simple-examples/data/ --model small

View File

@ -240,10 +240,11 @@ second sample is *Iris virginica*.
* For further reference materials on tf.contrib.learn, see the official
  [API docs](../../api_docs/python/contrib.learn.md).

<!-- David, will the below be live when this tutorial is released? -->
* To learn more about using tf.contrib.learn to create linear models, see
  [Large-scale Linear Models with TensorFlow](../linear/).
* To build your own Estimator using tf.contrib.learn APIs, check out [Building Machine Learning Estimator in TensorFlow](http://terrytangyuan.github.io/2016/07/08/understand-and-build-tensorflow-estimator/).
* To experiment with neural network modeling and visualization in the browser,
  check out [Deep Playground](http://playground.tensorflow.org/).

View File

@ -378,7 +378,8 @@ class Word2Vec(object):
    opts = self._options
    with open(os.path.join(opts.save_path, "vocab.txt"), "w") as f:
      for i in xrange(opts.vocab_size):
        f.write("%s %d\n" % (tf.compat.as_text(opts.vocab_words[i]),
        vocab_word = tf.compat.as_text(opts.vocab_words[i]).encode("utf-8")
        f.write("%s %d\n" % (vocab_word,
                             opts.vocab_counts[i]))

  def _train_thread_body(self):

View File

@ -82,10 +82,10 @@ def extract_data(filename, num_images):
  print('Extracting', filename)
  with gzip.open(filename) as bytestream:
    bytestream.read(16)
    buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images)
    buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS)
    data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32)
    data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH
    data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, 1)
    data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)
    return data

View File

@ -146,6 +146,7 @@ cuda_py_tests(
        "reverse_sequence_op_test.py",
        "rnn_cell_test.py",
        "scalar_strict_test.py",
        "scan_ops_test.py",
        "session_ops_test.py",
        "shape_ops_test.py",
        "softmax_op_test.py",

View File

@ -198,14 +198,19 @@ class ReverseTest(test_util.TensorFlowTestCase):
      x_tf = array_ops.reverse(x_np, []).eval()
      self.assertAllEqual(x_tf, x_np)

  def testReverse1DimAuto(self):
    x_np = [1, 4, 9]
  def _reverse1DimAuto(self, np_dtype):
    x_np = np.array([1, 2, 3, 4, 5], dtype=np_dtype)

    for use_gpu in [False, True]:
      with self.test_session(use_gpu=use_gpu):
        x_tf = array_ops.reverse(x_np, [True]).eval()
        self.assertAllEqual(x_tf, np.asarray(x_np)[::-1])

  def testReverse1DimAuto(self):
    for dtype in [np.uint8, np.int8, np.int32, np.bool, np.float16,
                  np.float32, np.float64, np.complex64, np.complex128]:
      self._reverse1DimAuto(dtype)

  def testUnknownDims(self):
    data_t = tf.placeholder(tf.float32)
    dims_known_t = tf.placeholder(tf.bool, shape=[3])

View File

@ -432,16 +432,13 @@ class ProdReductionTest(tf.test.TestCase):
    self._compareAll(np_arr, [0, 2])
    self._compareAll(np_arr, [0, 1, 2])

  def testGradient(self):
    s = [2, 3, 4, 2]
    # NOTE(kearnes): divide by 20 so product is a reasonable size
    x = np.arange(1.0, 49.0).reshape(s).astype(np.float32) / 20.
  def _compareGradient(self, x):
    with self.test_session():
      t = tf.convert_to_tensor(x)
      su = tf.reduce_prod(t, [])
      jacob_t, jacob_n = tf.test.compute_gradient(t,
                                                  s,
                                                  x.shape,
                                                  su,
                                                  [2, 3, 4, 2],
                                                  x_init_value=x,
@ -450,7 +447,7 @@ class ProdReductionTest(tf.test.TestCase):
      su = tf.reduce_prod(t, [1, 2])
      jacob_t, jacob_n = tf.test.compute_gradient(t,
                                                  s,
                                                  x.shape,
                                                  su,
                                                  [2, 2],
                                                  x_init_value=x,
@ -459,26 +456,34 @@ class ProdReductionTest(tf.test.TestCase):
      su = tf.reduce_prod(t, [0, 1, 2, 3])
      jacob_t, jacob_n = tf.test.compute_gradient(t,
                                                  s,
                                                  x.shape,
                                                  su,
                                                  [1],
                                                  x_init_value=x,
                                                  delta=1)
      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)

    # NOTE(kearnes): the current gradient calculation gives NaNs for 0 inputs
    x = np.arange(0.0, 48.0).reshape(s).astype(np.float32) / 20.
    with self.test_session():
      t = tf.convert_to_tensor(x)
      su = tf.reduce_prod(t, [])
      jacob_t, _ = tf.test.compute_gradient(t,
                                            s,
                                            su,
                                            [2, 3, 4, 2],
                                            x_init_value=x,
                                            delta=1)
      with self.assertRaisesOpError("Tensor had NaN values"):
        tf.check_numerics(jacob_t, message="_ProdGrad NaN test").op.run()

  def testGradientWithZeros(self):
    s = [2, 3, 4, 2]
    x = np.arange(1.0, 49.0).reshape(s).astype(np.float32) / 20.
    # No zeros in input
    self._compareGradient(x)
    # Zero at beginning
    x1 = x.copy()
    x1[:, :, 0, :] = 0
    self._compareGradient(x1)
    # Zero at end
    x2 = x.copy()
    x2[:, :, -1, :] = 0
    self._compareGradient(x2)
    # Zero in middle
    x3 = x.copy()
    x3[:, :, 2, :] = 0
    self._compareGradient(x3)
    # All zeros
    x4 = x.copy()
    x4[:, :, :, :] = 0
    self._compareGradient(x4)

  def testEmptyGradients(self):
    with self.test_session():

View File

@ -0,0 +1,229 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functional tests for scan ops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from itertools import combinations
import numpy as np
import tensorflow as tf
def numpy_reverse(x, axis):
ix = [slice(None, None, -1)
if i == axis else slice(None) for i in range(len(x.shape))]
return x[ix]
def handle_options(func, x, axis, exclusive, reverse):
"""Adds tf options to numpy scan ops"""
if reverse:
x = numpy_reverse(x, axis)
if exclusive:
ix_head = [slice(0, 1) if i == axis else slice(None)
for i in range(len(x.shape))]
ix_init = [slice(0, -1) if i == axis else slice(None)
for i in range(len(x.shape))]
if func == np.cumsum:
init = np.zeros_like(x[ix_head])
elif func == np.cumprod:
init = np.ones_like(x[ix_head])
else:
raise ValueError("Unknown scan function")
x = np.concatenate([init, func(x[ix_init], axis)], axis=axis)
else:
x = func(x, axis=axis)
if reverse:
x = numpy_reverse(x, axis)
return x
class CumsumTest(tf.test.TestCase):
valid_dtypes = [np.int32, np.int64, np.float16, np.float32,
np.float64, np.complex64, np.complex128]
def _compare(self, x, axis, exclusive, reverse, use_gpu=False):
np_out = handle_options(np.cumsum, x, axis, exclusive, reverse)
with self.test_session(use_gpu=use_gpu):
tf_out = tf.cumsum(x, axis, exclusive, reverse).eval()
self.assertAllClose(np_out, tf_out)
def _compareAll(self, x, axis):
for exclusive in [True, False]:
for reverse in [True, False]:
for use_gpu in [True, False]:
self._compare(x, axis, exclusive, reverse, use_gpu)
def test1D(self):
for dtype in self.valid_dtypes:
x = np.arange(1, 6).reshape([5]).astype(dtype)
self._compareAll(x, 0)
def test2D(self):
for dtype in self.valid_dtypes:
x = np.arange(0, 10).reshape([2, 5]).astype(dtype)
self._compareAll(x, 0)
self._compareAll(x, 1)
def test3D(self):
for dtype in self.valid_dtypes:
x = np.arange(0, 20).reshape([2, 2, 5]).astype(dtype)
self._compareAll(x, 0)
self._compareAll(x, 1)
self._compareAll(x, 2)
def testInvalidAxis(self):
x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
input_tensor = tf.convert_to_tensor(x)
with self.test_session():
with self.assertRaisesWithPredicateMatch(
tf.errors.InvalidArgumentError,
lambda e: "Expected scan axis in the range" in str(e)):
tf.cumsum(input_tensor, -1).eval()
with self.assertRaisesWithPredicateMatch(
tf.errors.InvalidArgumentError,
lambda e: "Expected scan axis in the range" in str(e)):
tf.cumsum(input_tensor, 2).eval()
with self.assertRaisesWithPredicateMatch(
tf.errors.InvalidArgumentError,
lambda e: "axis must be a scalar" in str(e)):
tf.cumsum(input_tensor, [0]).eval()
def _compareGradient(self, shape, axis, exclusive, reverse):
x = np.arange(0, 50).reshape(shape).astype(np.float64)
with self.test_session():
t = tf.convert_to_tensor(x)
result = tf.cumsum(t, axis, exclusive, reverse)
jacob_t, jacob_n = tf.test.compute_gradient(t,
shape,
result,
shape,
x_init_value=x,
delta=1)
self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
def testGradient(self):
self._compareGradient([50], 0, False, False)
def testGradientReverse(self):
self._compareGradient([50], 0, False, True)
def testGradientExclusive(self):
self._compareGradient([50], 0, True, False)
def testGradientExclusiveReverse(self):
self._compareGradient([50], 0, True, True)
def testGradient2D(self):
for axis in [0, 1]:
for exclusive in [True, False]:
for reverse in [True, False]:
self._compareGradient([5, 10], axis, exclusive, reverse)
class CumprodTest(tf.test.TestCase):
valid_dtypes = [np.int32, np.int64, np.float16, np.float32,
np.float64, np.complex64, np.complex128]
def _compare(self, x, axis, exclusive, reverse, use_gpu=False):
np_out = handle_options(np.cumprod, x, axis, exclusive, reverse)
with self.test_session(use_gpu=use_gpu):
tf_out = tf.cumprod(x, axis, exclusive, reverse).eval()
self.assertAllClose(np_out, tf_out)
def _compareAll(self, x, axis):
for exclusive in [True, False]:
for reverse in [True, False]:
for use_gpu in [True, False]:
self._compare(x, axis, exclusive, reverse, use_gpu)
def test1D(self):
for dtype in self.valid_dtypes:
x = np.arange(1, 6).reshape([5]).astype(dtype)
self._compareAll(x, 0)
def test2D(self):
for dtype in self.valid_dtypes:
x = np.arange(1, 11).reshape([2, 5]).astype(dtype)
self._compareAll(x, 0)
self._compareAll(x, 1)
def test3D(self):
for dtype in self.valid_dtypes:
x = np.arange(1, 21).reshape([2, 2, 5]).astype(dtype)
self._compareAll(x, 0)
self._compareAll(x, 1)
self._compareAll(x, 2)
def testInvalidAxis(self):
x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
input_tensor = tf.convert_to_tensor(x)
with self.test_session():
with self.assertRaisesWithPredicateMatch(
tf.errors.InvalidArgumentError,
lambda e: "Expected scan axis in the range" in str(e)):
tf.cumprod(input_tensor, -1).eval()
with self.assertRaisesWithPredicateMatch(
tf.errors.InvalidArgumentError,
lambda e: "Expected scan axis in the range" in str(e)):
tf.cumprod(input_tensor, 2).eval()
with self.assertRaisesWithPredicateMatch(
tf.errors.InvalidArgumentError,
lambda e: "axis must be a scalar" in str(e)):
tf.cumprod(input_tensor, [0]).eval()
def _compareGradient(self, shape, axis, exclusive, reverse):
x = np.arange(1, 9).reshape(shape).astype(np.float64)
with self.test_session():
t = tf.convert_to_tensor(x)
result = tf.cumprod(t, axis, exclusive, reverse)
jacob_t, jacob_n = tf.test.compute_gradient(t,
shape,
result,
shape,
x_init_value=x,
delta=1)
self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
def testGradient(self):
self._compareGradient([8], 0, False, False)
def testGradientReverse(self):
self._compareGradient([8], 0, False, True)
def testGradientExclusive(self):
self._compareGradient([8], 0, True, False)
def testGradientExclusiveReverse(self):
self._compareGradient([8], 0, True, True)
def testGradient2D(self):
for axis in [0, 1]:
for exclusive in [True, False]:
for reverse in [True, False]:
self._compareGradient([2, 4], axis, exclusive, reverse)
if __name__ == "__main__":
tf.test.main()

View File

@ -1021,6 +1021,12 @@ def _ResizeShape(op):
  return [tensor_shape.TensorShape(
      [input_shape[0], height, width, input_shape[3]])]

@ops.RegisterShape('DecodeGif')
def _ImageDecodeShape(op):
  """Shape function for decode gif."""
  unused_input_shape = op.inputs[0].get_shape().merge_with(
      tensor_shape.scalar())
  return [tensor_shape.TensorShape([None, None, None, 3])]

@ops.RegisterShape('DecodeJpeg')
@ops.RegisterShape('DecodePng')

View File

@ -27,6 +27,7 @@ from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
@ -42,34 +43,37 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
    np.random.seed(7)
    batch_size = 5
    shape = (batch_size, 2, 7, 3)
    inp = np.random.rand(*shape).astype(np.float32)

    # Convert to HSV and back, as a batch and individually
    with self.test_session() as sess:
      batch0 = constant_op.constant(inp)
      batch1 = image_ops.rgb_to_hsv(batch0)
      batch2 = image_ops.hsv_to_rgb(batch1)
      split0 = array_ops.unpack(batch0)
      split1 = list(map(image_ops.rgb_to_hsv, split0))
      split2 = list(map(image_ops.hsv_to_rgb, split1))
      join1 = array_ops.pack(split1)
      join2 = array_ops.pack(split2)
      batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])

    # Verify that processing batch elements together is the same as separate
    self.assertAllClose(batch1, join1)
    self.assertAllClose(batch2, join2)
    self.assertAllClose(batch2, inp)
    for nptype in [np.float32, np.float64]:
      inp = np.random.rand(*shape).astype(nptype)

      # Convert to HSV and back, as a batch and individually
      with self.test_session() as sess:
        batch0 = constant_op.constant(inp)
        batch1 = image_ops.rgb_to_hsv(batch0)
        batch2 = image_ops.hsv_to_rgb(batch1)
        split0 = array_ops.unpack(batch0)
        split1 = list(map(image_ops.rgb_to_hsv, split0))
        split2 = list(map(image_ops.hsv_to_rgb, split1))
        join1 = array_ops.pack(split1)
        join2 = array_ops.pack(split2)
        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])

      # Verify that processing batch elements together is the same as separate
      self.assertAllClose(batch1, join1)
      self.assertAllClose(batch2, join2)
      self.assertAllClose(batch2, inp)

  def testRGBToHSVRoundTrip(self):
    data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
    rgb_np = np.array(data, dtype=np.float32).reshape([2, 2, 3]) / 255.
    for use_gpu in [True, False]:
      with self.test_session(use_gpu=use_gpu):
        hsv = image_ops.rgb_to_hsv(rgb_np)
        rgb = image_ops.hsv_to_rgb(hsv)
        rgb_tf = rgb.eval()
        self.assertAllClose(rgb_tf, rgb_np)
    for nptype in [np.float32, np.float64]:
      rgb_np = np.array(data, dtype=nptype).reshape([2, 2, 3]) / 255.
      for use_gpu in [True, False]:
        with self.test_session(use_gpu=use_gpu):
          hsv = image_ops.rgb_to_hsv(rgb_np)
          rgb = image_ops.hsv_to_rgb(hsv)
          rgb_tf = rgb.eval()
          self.assertAllClose(rgb_tf, rgb_np)

class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
@ -1609,6 +1613,56 @@ class PngTest(test_util.TensorFlowTestCase):
                     [None, None, channels or None])
class GifTest(test_util.TensorFlowTestCase):
def testValid(self):
# Read some real GIFs
prefix = 'tensorflow/core/lib/gif/testdata/'
filename = 'scan.gif'
WIDTH = 20
HEIGHT = 40
STRIDE = 5
shape = (12, HEIGHT, WIDTH, 3)
with self.test_session() as sess:
gif0 = io_ops.read_file(prefix + filename)
image0 = image_ops.decode_gif(gif0)
gif0, image0 = sess.run([gif0, image0])
self.assertEqual(image0.shape, shape)
for frame_idx, frame in enumerate(image0):
gt = np.zeros(shape[1:], dtype=np.uint8)
start = frame_idx * STRIDE
end = (frame_idx + 1) * STRIDE
print(frame_idx)
if end <= WIDTH:
gt[:, start:end, :] = 255
else:
start -= WIDTH
end -= WIDTH
gt[start:end, :, :] = 255
self.assertAllClose(frame, gt)
def testInValid(self):
# Read some real GIFs
prefix = 'tensorflow/core/lib/gif/testdata/'
filename = 'optimized.gif'
with self.test_session() as sess:
gif0 = io_ops.read_file(prefix + filename)
image0 = image_ops.decode_gif(gif0)
with self.assertRaises(errors.InvalidArgumentError):
gif0, image0 = sess.run([gif0, image0])
def testShape(self):
with self.test_session() as sess:
gif = constant_op.constant('nonsense')
image = image_ops.decode_gif(gif)
self.assertEqual(image.get_shape().as_list(),
[None, None, None, 3])
class ConvertImageTest(test_util.TensorFlowTestCase):
def _convert(self, original, original_dtype, output_dtype, expected): def _convert(self, original, original_dtype, output_dtype, expected):

View File

@ -109,13 +109,41 @@ def _MeanGrad(op, grad):
@ops.RegisterGradient("Prod")
def _ProdGrad(op, grad):
  """Gradient for Prod."""
  # TODO(kearnes): this gives NaNs for 0s in the input tensor
  # The gradient can be expressed by dividing the product by each entry of the
  # input tensor, but this approach can't deal with zeros in the input.
  # Here, we avoid this problem by composing the output as a product of two
  # cumprod operations.
  input_shape = array_ops.shape(op.inputs[0])

  # Expand grad to full input shape
  output_shape_kept_dims = math_ops.reduced_shape(input_shape, op.inputs[1])
  tile_scaling = _safe_shape_div(input_shape, output_shape_kept_dims)
  grad = array_ops.reshape(grad * op.outputs[0], output_shape_kept_dims)
  grad = math_ops.div(array_ops.tile(grad, tile_scaling), op.inputs[0])
  return grad, None
  grad = array_ops.reshape(grad, output_shape_kept_dims)
  grad = array_ops.tile(grad, tile_scaling)
# Pack all reduced dimensions into a single one, so we can perform the
# cumprod ops. If the reduction dims list is empty, it defaults to float32,
# so we need to cast here.
reduced = math_ops.cast(op.inputs[1], dtypes.int32)
idx = math_ops.range(0, array_ops.rank(op.inputs[0]))
other, _ = array_ops.listdiff(idx, reduced)
perm = array_ops.concat(0, [reduced, other])
reduced_num = math_ops.reduce_prod(array_ops.gather(input_shape, reduced))
other_num = math_ops.reduce_prod(array_ops.gather(input_shape, other))
permuted = array_ops.transpose(op.inputs[0], perm)
permuted_shape = array_ops.shape(permuted)
reshaped = array_ops.reshape(permuted, (reduced_num, other_num))
# Calculate product, leaving out the current entry
left = math_ops.cumprod(reshaped, axis=0, exclusive=True)
right = math_ops.cumprod(reshaped, axis=0, exclusive=True, reverse=True)
y = array_ops.reshape(left * right, permuted_shape)
# Invert the transpose and reshape operations.
# Make sure to set the statically known shape information through a reshape.
out = grad * array_ops.transpose(y, array_ops.invert_permutation(perm))
return array_ops.reshape(out, input_shape), None
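A NumPy sketch of the idea used above (illustrative only): the gradient of `prod(x)` with respect to `x[i]` is the product of every entry except `x[i]`, which factors into an exclusive cumprod from the left times an exclusive cumprod from the right, so no division by `x` is ever needed.

```python
import numpy as np

def prod_grad_1d(x):
    # Exclusive cumprod of everything to the left of each entry.
    left = np.cumprod(np.concatenate(([1.0], x[:-1])))
    # Exclusive, reversed cumprod of everything to the right of each entry.
    right = np.cumprod(np.concatenate((x[1:], [1.0]))[::-1])[::-1]
    return left * right

x = np.array([2.0, 0.0, 3.0])
print(prod_grad_1d(x))  # [0. 6. 0.] -- finite even with a zero in x
```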
@ops.RegisterGradient("SegmentSum")
@ -839,3 +867,26 @@ def _CrossGrad(op, grad):
  u = op.inputs[0]
  v = op.inputs[1]
  return (math_ops.cross(v, grad), math_ops.cross(grad, u))
@ops.RegisterGradient("Cumsum")
def _CumsumGrad(op, grad):
axis = op.inputs[1]
exclusive = op.get_attr("exclusive")
reverse = op.get_attr("reverse")
return [math_ops.cumsum(grad, axis, exclusive=exclusive,
reverse=not reverse), None]
@ops.RegisterGradient("Cumprod")
def _CumprodGrad(op, grad):
x = op.inputs[0]
axis = op.inputs[1]
exclusive = op.get_attr("exclusive")
reverse = op.get_attr("reverse")
# TODO This fails when x contains 0 and should be fixed
prod = math_ops.cumprod(x, axis, exclusive=exclusive, reverse=reverse)
out = math_ops.cumsum(prod * grad, axis, exclusive=exclusive,
reverse=not reverse)
return [out / x, None]
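The `Cumsum` gradient above has a closed form worth sanity-checking: since y_j = sum_{i<=j} x_i, each dL/dx_i collects every downstream dL/dy_j, i.e. a cumsum of the incoming gradient in the opposite direction. A NumPy sketch with illustrative values:

```python
import numpy as np

g = np.array([0.1, 0.2, 0.3])   # incoming gradient dL/dy
dx = np.cumsum(g[::-1])[::-1]   # reverse cumsum: dL/dx_i = sum over j >= i of g_j
print(dx)                       # [0.6 0.5 0.3]
```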

View File

@ -13,7 +13,10 @@
# limitations under the License.
# ==============================================================================

"""## Arithmetic Operators
"""Note: Elementwise binary operations in TensorFlow follow [numpy-style
broadcasting](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).

## Arithmetic Operators

TensorFlow provides several operations that you can use to add basic arithmetic
operators to your graph.
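A tiny concrete illustration of the broadcasting note above (a sketch; any elementwise binary op behaves the same way):

```python
import tensorflow as tf

a = tf.constant([[1., 2.], [3., 4.]])  # shape [2, 2]
b = tf.constant([10., 100.])           # shape [2], broadcast across rows

with tf.Session() as sess:
    print(sess.run(a * b))  # [[ 10. 200.] [ 30. 400.]]
```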
@ -145,6 +148,14 @@ common math computations that reduce various dimensions of a tensor.
@@accumulate_n
## Scan
TensorFlow provides several operations that you can use to perform scans
(running totals) across one axis of a tensor.
@@cumsum
@@cumprod
## Segmentation

TensorFlow provides several operations that you can use to perform common
@ -1585,6 +1596,94 @@ def tanh(x, name=None):
  return gen_math_ops._tanh(x, name=name)
def cumsum(x, axis=0, exclusive=False, reverse=False, name=None):
"""Compute the cumulative sum of the tensor `x` along `axis`.
By default, this op performs an inclusive cumsum, which means that the first
element of the input is identical to the first element of the output:
```prettyprint
tf.cumsum([a, b, c]) ==> [a, a + b, a + b + c]
```
By setting the `exclusive` kwarg to `True`, an exclusive cumsum is performed
instead:
```prettyprint
tf.cumsum([a, b, c], exclusive=True) ==> [0, a, a + b]
```
By setting the `reverse` kwarg to `True`, the cumsum is performed in the
opposite direction:
```prettyprint
tf.cumsum([a, b, c], reverse=True) ==> [a + b + c, b + c, c]
```
This is more efficient than using separate `tf.reverse` ops.
The `reverse` and `exclusive` kwargs can also be combined:
```prettyprint
tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0]
```
Args:
x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
`int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
`complex128`, `qint8`, `quint8`, `qint32`, `half`.
axis: A `Tensor` of type `int32` (default: 0).
    exclusive: If `True`, perform exclusive cumsum.
    reverse: A `bool` (default: False).
name: A name for the operation (optional).
Returns:
A `Tensor`. Has the same type as `x`.
"""
with ops.op_scope([x], name, "Cumsum") as name:
x = ops.convert_to_tensor(x, name="x")
return gen_math_ops.cumsum(x, axis, exclusive=exclusive,
reverse=reverse, name=name)
def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
"""Compute the cumulative product of the tensor `x` along `axis`.
By default, this op performs an inclusive cumprod, which means that the first
element of the input is identical to the first element of the output:
```prettyprint
tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c]
```
By setting the `exclusive` kwarg to `True`, an exclusive cumprod is performed
instead:
```prettyprint
tf.cumprod([a, b, c], exclusive=True) ==> [1, a, a * b]
```
By setting the `reverse` kwarg to `True`, the cumprod is performed in the
opposite direction:
```prettyprint
tf.cumprod([a, b, c], reverse=True) ==> [a * b * c, b * c, c]
```
This is more efficient than using separate `tf.reverse` ops.
The `reverse` and `exclusive` kwargs can also be combined:
```prettyprint
tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 0]
```
Args:
x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
`int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
`complex128`, `qint8`, `quint8`, `qint32`, `half`.
axis: A `Tensor` of type `int32` (default: 0).
    exclusive: If `True`, perform exclusive cumprod.
    reverse: A `bool` (default: False).
name: A name for the operation (optional).
Returns:
A `Tensor`. Has the same type as `x`.
"""
with ops.op_scope([x], name, "Cumprod") as name:
x = ops.convert_to_tensor(x, name="x")
return gen_math_ops.cumprod(x, axis, exclusive=exclusive,
reverse=reverse, name=name)
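A small numeric check of the semantics documented above (a sketch, in the 0.x `Session` style used elsewhere in this commit):

```python
import tensorflow as tf

x = tf.constant([1., 2., 3.])
with tf.Session() as sess:
    print(sess.run(tf.cumsum(x)))                                 # [1. 3. 6.]
    print(sess.run(tf.cumsum(x, exclusive=True)))                 # [0. 1. 3.]
    print(sess.run(tf.cumsum(x, reverse=True)))                   # [6. 5. 3.]
    print(sess.run(tf.cumprod(x, exclusive=True, reverse=True)))  # [6. 3. 1.]
```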
ops.RegisterShape("Abs")(common_shapes.unchanged_shape)
ops.RegisterShape("Acos")(common_shapes.unchanged_shape)
ops.RegisterShape("Asin")(common_shapes.unchanged_shape)
@ -1632,6 +1731,8 @@ ops.RegisterShape("BatchFFT3D")(common_shapes.unchanged_shape)
ops.RegisterShape("BatchIFFT3D")(common_shapes.unchanged_shape) ops.RegisterShape("BatchIFFT3D")(common_shapes.unchanged_shape)
ops.RegisterShape("TanhGrad")(common_shapes.unchanged_shape) ops.RegisterShape("TanhGrad")(common_shapes.unchanged_shape)
ops.RegisterShape("SigmoidGrad")(common_shapes.unchanged_shape) ops.RegisterShape("SigmoidGrad")(common_shapes.unchanged_shape)
ops.RegisterShape("Cumsum")(common_shapes.unchanged_shape)
ops.RegisterShape("Cumprod")(common_shapes.unchanged_shape)
@ops.RegisterShape("Add")
@@ -648,7 +648,7 @@ class DropoutWrapper(RNNCell):
% input_keep_prob)
if (isinstance(output_keep_prob, float) and
not (output_keep_prob >= 0.0 and output_keep_prob <= 1.0)):
raise ValueError("Parameter output_keep_prob must be between 0 and 1: %d"
% output_keep_prob)
self._cell = cell
self._input_keep_prob = input_keep_prob
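For reference, a minimal sketch of constructing the wrapper whose validation message is fixed above (assumes the contemporaneous `tf.nn.rnn_cell` module path; unit counts are illustrative):

```python
import tensorflow as tf

# Keep probabilities must lie in [0, 1]; values outside that range now
# raise the correctly worded ValueError for output_keep_prob as well.
base_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=128)
cell = tf.nn.rnn_cell.DropoutWrapper(
    base_cell, input_keep_prob=0.8, output_keep_prob=0.5)
```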
@@ -395,13 +395,14 @@ def Walk(top, topdown=1, onerror=None):
optional argument "onerror" is specified, it should be a function. It
will be called with one argument, an os.error instance. It can return
to continue with the walk, or reraise the exception to abort the walk.
By default, the walk follows symlinks that resolve into directories.
Yields:
# Each yield is a 3-tuple: the pathname of a directory, followed
# by lists of all its subdirectories and leaf files.
(dirname, [subdirname, subdirname, ...], [filename, filename, ...])
"""
return os.walk(top, topdown=topdown, onerror=onerror, followlinks=True)
def Stat(path):  # pylint: disable=invalid-name
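A standard-library sketch of what the `followlinks=True` change does (directory name is hypothetical):

```python
import os

# With followlinks=False (the default), os.walk does not descend into
# directories reached through symlinks; with followlinks=True it does.
# Beware: a symlink pointing at an ancestor directory can cause an
# unbounded walk.
for dirname, subdirs, files in os.walk("/tmp/example", followlinks=True):
    print(dirname, subdirs, files)
```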
@@ -92,7 +92,7 @@ def input_producer(input_tensor, element_shape=None, num_epochs=None,
"""Output the rows of `input_tensor` to a queue for an input pipeline.
Args:
input_tensor: A tensor with the rows to produce. Must be at least
one-dimensional. Must either have a fully-defined shape, or
`element_shape` must be defined.
element_shape: (Optional.) A `TensorShape` representing the shape of a
@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import control_flow_ops
@@ -40,7 +41,7 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
decay_rate ^ (global_step / decay_steps)
```
If the argument `staircase` is `True`, then `global_step / decay_steps` is an
integer division and the decayed learning rate follows a staircase function.
Example: decay every 100000 steps with a base of 0.96:
@@ -67,15 +68,16 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
Must be positive. See the decay computation above.
decay_rate: A scalar `float32` or `float64` `Tensor` or a
Python number. The decay rate.
staircase: Boolean. If `True`, decay the learning rate at discrete intervals.
name: String. Optional name of the operation. Defaults to
'ExponentialDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
learning rate.
"""
with ops.op_scope([learning_rate, global_step, decay_steps, decay_rate],
name, "ExponentialDecay") as name:
learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
dtype = learning_rate.dtype
global_step = math_ops.cast(global_step, dtype)
@@ -89,19 +91,19 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
def piecewise_constant(x, boundaries, values, name=None):
""" Piecewise constant from boundaries and interval values.
Example: use a learning rate that's 1.0 for the first 100000 steps, 0.5
for steps 100001 to 110000, and 0.1 for any additional steps.
```python
global_step = tf.Variable(0, trainable=False)
boundaries = [100000, 110000]
values = [1.0, 0.5, 0.1]
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
# Later, whenever we perform an optimization step, we increment global_step.
```
Args:
x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`,
`float64`, `uint8`, `int8`, `int16`, `int32`, `int64`.
@@ -112,13 +114,13 @@ def piecewise_constant(x, boundaries, values, name=None):
than `boundaries`, and all elements should have the same type.
name: A string. Optional name of the operation. Defaults to
'PiecewiseConstant'.
Returns:
A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`,
`values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ...,
and values[-1] when `x > boundaries[-1]`.
"""
with ops.op_scope([x, boundaries, values, name],
name, 'PiecewiseConstant') as name:
x = ops.convert_to_tensor(x)
@@ -131,7 +133,7 @@ def piecewise_constant(x, boundaries, values, name=None):
values = ops.convert_n_to_tensor(values)
if not all(v.dtype == values[0].dtype for v in values):
raise ValueError('values must have elements all with the same dtype.')
pred_fn_pairs = {}
pred_fn_pairs[x <= boundaries[0]] = lambda: values[0]
pred_fn_pairs[x > boundaries[-1]] = lambda: values[-1]
@@ -139,7 +141,7 @@ def piecewise_constant(x, boundaries, values, name=None):
# Need to bind v here; can do this with lambda v=v: ...
pred = (x > low) & (x <= high)
pred_fn_pairs[pred] = lambda v=v: v
# The default isn't needed here because our conditions are mutually
# exclusive and exhaustive, but tf.case requires it.
default = lambda: values[0]
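The `lambda v=v:` idiom above works around Python's late-binding closures; a minimal standalone demonstration of the pitfall and the default-argument fix:

```python
# Without a default argument, every lambda sees the loop variable's
# final value.
late = [lambda: v for v in range(3)]
print([f() for f in late])   # [2, 2, 2]

# Binding v as a default argument captures its value at definition time.
bound = [lambda v=v: v for v in range(3)]
print([f() for f in bound])  # [0, 1, 2]
```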
@@ -237,3 +239,125 @@ def polynomial_decay(learning_rate, global_step, decay_steps,
return math_ops.add(math_ops.mul(learning_rate - end_learning_rate,
math_ops.pow(1 - p, power)),
end_learning_rate, name=name)
def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate,
staircase=False, name=None):
"""Applies natural exponential decay to the initial learning rate.
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies an exponential decay function
to a provided initial learning rate. It requires a `global_step` value to
compute the decayed learning rate. You can just pass a TensorFlow variable
that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
```python
decayed_learning_rate = learning_rate * exp(-decay_rate * global_step / decay_steps)
```
Example: decay exponentially with a rate of 0.5:
```python
...
global_step = tf.Variable(0, trainable=False)
learning_rate = 0.1
decay_steps = 1000  # illustrative value
k = 0.5
learning_rate = tf.train.natural_exp_decay(learning_rate, global_step,
decay_steps, k)
# Passing global_step to minimize() will increment it at each step.
learning_step = (
tf.train.GradientDescentOptimizer(learning_rate)
.minimize(...my loss..., global_step=global_step)
)
```
Args:
learning_rate: A scalar `float32` or `float64` `Tensor` or a
Python number. The initial learning rate.
global_step: A Python number.
Global step to use for the decay computation. Must not be negative.
decay_steps: A Python number. How often to apply decay.
decay_rate: A Python number. The decay rate.
staircase: Boolean. If `True`, decay the learning rate at discrete intervals.
name: String. Optional name of the operation. Defaults to
'NaturalExpDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
learning rate.
"""
with ops.op_scope([learning_rate, global_step, decay_rate],
name, "NaturalExpDecay") as name:
learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
dtype = learning_rate.dtype
global_step = math_ops.cast(global_step, dtype)
decay_steps = math_ops.cast(decay_steps, dtype)
decay_rate = math_ops.cast(decay_rate, dtype)
p = global_step / decay_steps
if staircase:
p = math_ops.floor(p)
exponent = math_ops.exp(math_ops.mul(math_ops.neg(decay_rate), p))
return math_ops.mul(learning_rate, exponent, name=name)
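A plain-Python sketch of the same arithmetic, handy for sanity-checking the staircase behavior (values are illustrative):

```python
from __future__ import division
import math

def natural_exp_decay_np(lr, global_step, decay_steps, decay_rate,
                         staircase=False):
    # Mirrors the graph computation above: lr * exp(-decay_rate * p).
    p = global_step / decay_steps
    if staircase:
        p = math.floor(p)
    return lr * math.exp(-decay_rate * p)

print(natural_exp_decay_np(0.1, 500, 1000, 0.5))                  # ~0.0779
print(natural_exp_decay_np(0.1, 500, 1000, 0.5, staircase=True))  # 0.1
```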
def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
staircase=False, name=None):
"""Applies inverse time decay to the initial learning rate.
When training a model, it is often recommended to lower the learning rate as
the training progresses. This function applies an inverse decay function
to a provided initial learning rate. It requires a `global_step` value to
compute the decayed learning rate. You can just pass a TensorFlow variable
that you increment at each training step.
The function returns the decayed learning rate. It is computed as:
```python
decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_steps)
```
Example: decay 1/t with a rate of 0.5:
```python
...
global_step = tf.Variable(0, trainable=False)
learning_rate = 0.1
decay_steps = 1000  # illustrative value
k = 0.5
learning_rate = tf.train.inverse_time_decay(learning_rate, global_step,
decay_steps, k)
# Passing global_step to minimize() will increment it at each step.
learning_step = (
tf.train.GradientDescentOptimizer(learning_rate)
.minimize(...my loss..., global_step=global_step)
)
```
Args:
learning_rate: A scalar `float32` or `float64` `Tensor` or a
Python number. The initial learning rate.
global_step: A Python number.
Global step to use for the decay computation. Must not be negative.
decay_steps: A Python number. How often to apply decay.
decay_rate: A Python number. The decay rate.
staircase: Boolean. If `True`, decay the learning rate at discrete intervals.
name: String. Optional name of the operation. Defaults to
'InverseTimeDecay'.
Returns:
A scalar `Tensor` of the same type as `learning_rate`. The decayed
learning rate.
"""
with ops.op_scope([learning_rate, global_step, decay_rate],
name, "InverseTimeDecay") as name:
learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
dtype = learning_rate.dtype
global_step = math_ops.cast(global_step, dtype)
decay_steps = math_ops.cast(decay_steps, dtype)
decay_rate = math_ops.cast(decay_rate, dtype)
p = global_step / decay_steps
if staircase:
p = math_ops.floor(p)
const = math_ops.cast(constant_op.constant(1), learning_rate.dtype)
denom = math_ops.add(const, math_ops.mul(decay_rate, p))
return math_ops.div(learning_rate, denom, name=name)
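And the matching plain-Python sketch for inverse time decay (again with illustrative values):

```python
from __future__ import division
import math

def inverse_time_decay_np(lr, global_step, decay_steps, decay_rate,
                          staircase=False):
    # Mirrors the graph computation above: lr / (1 + decay_rate * p).
    p = global_step / decay_steps
    if staircase:
        p = math.floor(p)
    return lr / (1 + decay_rate * p)

print(inverse_time_decay_np(0.1, 500, 1000, 0.5))                  # 0.08
print(inverse_time_decay_np(0.1, 500, 1000, 0.5, staircase=True))  # 0.1
```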
@@ -18,6 +18,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import test_util
from tensorflow.python.ops import state_ops
@@ -50,7 +52,7 @@ class LRDecayTest(test_util.TensorFlowTestCase):
self.assertAllClose(decayed_lr.eval(), .1, 1e-6)
# Decayed learning rate
assign_100.op.run()
expected = .1 * 0.96 ** (100 // 3)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
def testVariables(self):
@@ -69,7 +71,7 @@ class LRDecayTest(test_util.TensorFlowTestCase):
self.assertAllClose(decayed_lr.eval(), .1, 1e-6)
# Decayed learning rate
assign_100.op.run()
expected = .1 * 0.96 ** (100 // 3)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
def testPiecewiseConstant(self):
@@ -215,5 +217,83 @@ class SqrtDecayTest(test_util.TensorFlowTestCase):
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
class ExponentialDecayTest(test_util.TensorFlowTestCase):
def testDecay(self):
initial_lr = 0.1
k = 10
decay_rate = 0.96
step = state_ops.variable_op([], dtypes.int32)
assign_step = state_ops.assign(step, 0)
increment_step = state_ops.assign_add(step, 1)
decayed_lr = learning_rate_decay.natural_exp_decay(initial_lr, step,
k, decay_rate)
with self.test_session():
assign_step.op.run()
for i in range(k+1):
expected = initial_lr * math.exp(-i / k * decay_rate)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
increment_step.op.run()
def testStaircase(self):
initial_lr = 0.1
k = 10
decay_rate = 0.96
step = state_ops.variable_op([], dtypes.int32)
assign_step = state_ops.assign(step, 0)
increment_step = state_ops.assign_add(step, 1)
decayed_lr = learning_rate_decay.natural_exp_decay(initial_lr,
step,
k,
decay_rate,
staircase=True)
with self.test_session():
assign_step.op.run()
for i in range(k+1):
expected = initial_lr * math.exp(-decay_rate * (i // k))
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
increment_step.op.run()
class InverseDecayTest(test_util.TensorFlowTestCase):
def testDecay(self):
initial_lr = 0.1
k = 10
decay_rate = 0.96
step = state_ops.variable_op([], dtypes.int32)
assign_step = state_ops.assign(step, 0)
increment_step = state_ops.assign_add(step, 1)
decayed_lr = learning_rate_decay.inverse_time_decay(initial_lr,
step,
k,
decay_rate)
with self.test_session():
assign_step.op.run()
for i in range(k+1):
expected = initial_lr / (1 + i / k * decay_rate)
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
increment_step.op.run()
def testStaircase(self):
initial_lr = 0.1
k = 10
decay_rate = 0.96
step = state_ops.variable_op([], dtypes.int32)
assign_step = state_ops.assign(step, 0)
increment_step = state_ops.assign_add(step, 1)
decayed_lr = learning_rate_decay.inverse_time_decay(initial_lr,
step,
k,
decay_rate,
staircase=True)
with self.test_session():
assign_step.op.run()
for i in range(k+1):
expected = initial_lr / (1 + decay_rate * (i // k))
self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
increment_step.op.run()
if __name__ == "__main__":
googletest.main()
@@ -31,7 +31,7 @@ class MomentumOptimizer(optimizer.Optimizer):
"""
def __init__(self, learning_rate, momentum,
use_locking=False, name="Momentum", use_nesterov=False):
"""Construct a new Momentum optimizer.
Args:
@@ -44,6 +44,7 @@ class MomentumOptimizer(optimizer.Optimizer):
super(MomentumOptimizer, self).__init__(use_locking, name)
self._learning_rate = learning_rate
self._momentum = momentum
self._use_nesterov = use_nesterov
def _create_slots(self, var_list):
for v in var_list:
@@ -62,7 +63,8 @@ class MomentumOptimizer(optimizer.Optimizer):
math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
grad,
math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
use_locking=self._use_locking,
use_nesterov=self._use_nesterov).op
def _apply_sparse(self, grad, var):
mom = self.get_slot(var, "momentum")
@@ -71,4 +73,5 @@ class MomentumOptimizer(optimizer.Optimizer):
math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
grad.values, grad.indices,
math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
use_locking=self._use_locking,
use_nesterov=self._use_nesterov).op
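A minimal usage sketch of the new flag (hyperparameters are illustrative; `loss` is assumed to be defined elsewhere):

```python
import tensorflow as tf

# Nesterov momentum evaluates the gradient after the momentum look-ahead,
# which often converges faster than classical momentum.
optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9,
                                       use_nesterov=True)
# train_op = optimizer.minimize(loss)
```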
@@ -25,6 +25,13 @@ import tensorflow as tf
class MomentumOptimizerTest(tf.test.TestCase):
def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum):
# Undo the look-ahead shift applied at the end of the previous step.
var = var + accum * lr * momentum
# Standard momentum accumulator and parameter updates.
accum = accum * momentum + g
var = var - lr * accum
# Re-apply the look-ahead shift so `var` matches the shifted variable
# that the Nesterov kernel maintains.
var = var - accum * lr * momentum
return var, accum
def testBasic(self):
for dtype in [tf.half, tf.float32, tf.float64]:
with self.test_session():
@@ -80,6 +87,68 @@ class MomentumOptimizerTest(tf.test.TestCase):
3.98 - ((0.9 * 0.01 + 0.01) * 2.0)]),
var1.eval())
def testNesterovMomentum(self):
for dtype in [tf.float32, tf.float64]:
with self.test_session():
var0 = tf.Variable([1.0, 2.0], dtype=dtype)
var1 = tf.Variable([3.0, 4.0], dtype=dtype)
var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
cost = 5 * var0 * var0 + 3 * var1
global_step = tf.Variable(tf.zeros([], tf.int64), name='global_step')
mom_op = tf.train.MomentumOptimizer(learning_rate=2.0, momentum=0.9,
use_nesterov=True)
opt_op = mom_op.minimize(cost, global_step, [var0, var1])
tf.initialize_all_variables().run()
for t in range(1, 5):
opt_op.run()
var0_np, accum0_np = self._update_nesterov_momentum_numpy(var0_np,
accum0_np, var0_np * 10, 2.0, 0.9)
var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
accum1_np, 3, 2.0, 0.9)
self.assertAllClose(var0_np, var0.eval())
self.assertAllClose(var1_np, var1.eval())
def testSparseNesterovMomentum(self):
for dtype in [tf.float32, tf.float64]:
with self.test_session():
var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
grads = []
for t in range(1, 5):
grads.append(var0_np * 10)
var0_np, accum0_np = self._update_nesterov_momentum_numpy(var0_np,
accum0_np, var0_np * 10, 2.0, 0.9)
var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
accum1_np, 3, 2.0, 0.9)
var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
var0 = tf.Variable(var0_np)
var1 = tf.Variable(var1_np)
loss = 5 * var0 * var0 + 3 * var1
mom_op = tf.train.MomentumOptimizer(learning_rate=2.0, momentum=0.9,
use_nesterov=True)
x_feed = tf.placeholder(dtype)
y_feed = tf.IndexedSlices(x_feed, tf.constant([0, 1]), tf.constant([2]))
grads_and_vars = [(y_feed, var0),
(tf.constant([3.0, 3.0], dtype=dtype), var1)]
opt_update = mom_op.apply_gradients(grads_and_vars)
tf.initialize_all_variables().run()
for t in range(1, 5):
opt_update.run(feed_dict={x_feed: grads[t - 1]})
var0_np, accum0_np = self._update_nesterov_momentum_numpy(var0_np,
accum0_np, var0_np * 10, 2.0, 0.9)
var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
accum1_np, 3, 2.0, 0.9)
self.assertAllClose(var0_np, var0.eval())
self.assertAllClose(var1_np, var1.eval())
def testTensorLearningRateAndMomentum(self):
for dtype in [tf.half, tf.float32, tf.float64]:
with self.test_session():
@@ -314,8 +314,17 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier, (const void**)&cuda_driver_info)) {
// NOTE: OSX CUDA driver does not currently store the same driver version
// in kCFBundleVersionKey as is returned by cuDriverGetVersion
const char * version = CFStringGetCStringPtr((CFStringRef)CFDictionaryGetValue(cuda_driver_info, kCFBundleVersionKey), kCFStringEncodingUTF8);
CFRelease(kext_infos);
const CFStringRef str = (CFStringRef)CFDictionaryGetValue(
cuda_driver_info, kCFBundleVersionKey);
const char *version = CFStringGetCStringPtr(str, kCFStringEncodingUTF8);
// version can be NULL in which case treat it as empty string
// see
// https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
if (version == NULL) {
return StringToDriverVersion("");
}
return StringToDriverVersion(version);
}
CFRelease(kext_infos);
@@ -54,6 +54,15 @@ NarrowT CheckedNarrowing(const WideT& wide) {
return narrow;
}
// Returns the "Compatibility" version number from the CuDNN version number.
// This is the number that tries to indicate ABI compatibility.
//
// For example, if cudnn_version is 5107, the compatibility version
// number will be 5100.
size_t cudnnCompatibilityVersion(size_t cudnn_version) {
return (cudnn_version / 100) * 100;
}
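The rounding rule is plain integer arithmetic; a one-line Python sketch for intuition:

```python
def cudnn_compatibility_version(cudnn_version):
    # Drop the patch level: 5107 -> 5100, keeping the major/minor digits.
    return (cudnn_version // 100) * 100

assert cudnn_compatibility_version(5107) == 5100
```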
} // namespace
namespace perftools {
@@ -139,13 +148,6 @@ size_t cudnnGetVersion() {
return callable();
}
// Returns whether the currently loaded cuDNN version is R2.
bool IsCudnnR2() {
static auto version = cudnnGetVersion();
DCHECK_GE(version, 2000);
return version < 3000;
}
#define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name) \
struct DynLoadShim__##__name { \
static const char* kName; \
@@ -197,26 +199,13 @@ bool IsCudnnR2() {
__macro(cudnnPoolingForward) \
__macro(cudnnPoolingBackward) \
__macro(cudnnLRNCrossChannelForward) \
__macro(cudnnLRNCrossChannelBackward) \
// clang-format on
CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
// clang-format off
#if CUDNN_VERSION >= 4000 && CUDNN_VERSION < 5000
#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
__macro(cudnnAddTensor_v2) \
__macro(cudnnConvolutionBackwardData_v2) \
__macro(cudnnConvolutionBackwardFilter_v2)
#else
#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
__macro(cudnnAddTensor) \
__macro(cudnnConvolutionBackwardData) \
__macro(cudnnConvolutionBackwardFilter)
#endif
// clang-format on
CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
// APIs available after R3:
#if CUDNN_VERSION >= 3000
@@ -340,15 +329,21 @@ port::Status CudnnSupport::Init() {
// Check whether loaded version of CuDNN matches what the source
// was built with.
size_t loaded_version = dynload::cudnnGetVersion();
size_t loaded_compat_version = cudnnCompatibilityVersion(loaded_version);
size_t compiled_compat_version = cudnnCompatibilityVersion(CUDNN_VERSION);
bool library_loaded_matches_source =
(loaded_compat_version == compiled_compat_version);
if (!library_loaded_matches_source) {
const string error =
port::StrCat("Loaded cudnn library: ", loaded_version, port::StrCat("Loaded runtime CuDNN library: ", loaded_version,
" but source was compiled against ", CUDNN_VERSION, " (compatibility version ", loaded_compat_version,
". If using a binary install, upgrade your cudnn " ") but source was compiled with ", CUDNN_VERSION,
" (compatibility version ", compiled_compat_version,
"). If using a binary install, upgrade your CuDNN "
"library to match. If building from sources, " "library to match. If building from sources, "
"make sure the library loaded matches the " "make sure the library loaded at runtime matches a "
"version you specified during compile configuration."); "compatible version specified during compile "
"configuration.");
LOG(ERROR) << error;
return port::Status{port::error::INTERNAL, error};
}
@@ -1109,31 +1104,6 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
CUDNN_DATA_FLOAT};
#if CUDNN_VERSION < 5000
#if CUDNN_VERSION >= 3000
if (dynload::IsCudnnR2()) {
#endif
#if CUDNN_VERSION >= 4000
status = dynload::cudnnConvolutionBackwardData_v2(
#else
status = dynload::cudnnConvolutionBackwardData(
#endif
parent_, ToHandle(dnn_handle_), &alpha, filter.handle(),
filter_data.opaque(), out_back_nd.handle(),
backward_output_data.opaque(), conv.handle(), &beta,
in_back_nd.handle(), backward_input_data->opaque());
if (status != CUDNN_STATUS_SUCCESS) {
LOG(FATAL) << "failed to enqueue convolution on stream: "
<< ToString(status);
return false;
}
return true;
#if CUDNN_VERSION >= 3000
}
#endif
#endif
#if CUDNN_VERSION >= 3000
const bool is_profiling = output_profile_result != nullptr;
cudnnConvolutionBwdDataAlgo_t algo;
DeviceMemory<uint8> scratch;
@@ -1284,7 +1254,6 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
return false;
}
return true;
#endif
}
bool CudnnSupport::DoConvolveBackwardData(
@@ -1369,31 +1338,6 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
CUDNN_DATA_FLOAT};
#if CUDNN_VERSION < 5000
#if CUDNN_VERSION >= 3000
if (dynload::IsCudnnR2()) {
#endif
#if CUDNN_VERSION >= 4000
status = dynload::cudnnConvolutionBackwardFilter_v2(
#else
status = dynload::cudnnConvolutionBackwardFilter(
#endif
parent_, ToHandle(dnn_handle_), &alpha, input_nd.handle(),
input_data.opaque(), out_back_nd.handle(),
backward_output_data.opaque(), conv.handle(), &beta, filter.handle(),
backward_filter_data->opaque());
if (status != CUDNN_STATUS_SUCCESS) {
LOG(FATAL) << "failed to enqueue convolution on stream: "
<< ToString(status);
return false;
}
return true;
#if CUDNN_VERSION >= 3000
}
#endif
#endif
#if CUDNN_VERSION >= 3000
const bool is_profiling = output_profile_result != nullptr;
cudnnConvolutionBwdFilterAlgo_t algo;
DeviceMemory<uint8> scratch;
@@ -1544,7 +1488,6 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
return false;
}
return true;
#endif
}
bool CudnnSupport::DoConvolveBackwardFilter(
@@ -1824,33 +1767,15 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
const float alpha = 1.0f;
const float beta = 1.0f;
#if CUDNN_VERSION >= 3000
if (dynload::IsCudnnR2()) {
#endif
#if CUDNN_VERSION < 5000
#if CUDNN_VERSION >= 4000
status = dynload::cudnnAddTensor_v2(
#else
status = dynload::cudnnAddTensor(
#endif
parent_, ToHandle(dnn_handle_), CUDNN_ADD_SAME_C, &alpha,
bias_descriptor.handle(), biases.opaque(), &beta,
input_descriptor.handle(), output_data->opaque());
#endif // CUDNN_VERSION < 5000
#if CUDNN_VERSION >= 3000
} else {
#if CUDNN_VERSION >= 5000
status = dynload::cudnnAddTensor(
#else
status = dynload::cudnnAddTensor_v3(
#endif
parent_, ToHandle(dnn_handle_), &alpha, bias_descriptor.handle(),
biases.opaque(), &beta, input_descriptor.handle(),
output_data->opaque());
}
#endif
parent_, ToHandle(dnn_handle_), &alpha, bias_descriptor.handle(),
biases.opaque(), &beta, input_descriptor.handle(),
output_data->opaque());
if (status != CUDNN_STATUS_SUCCESS) {
LOG(ERROR) << "stream " << stream << " could not enqueue bias addition.";
@@ -10,10 +10,10 @@ exports_files(["LICENSE"])
filegroup(
name = "frontend",
srcs = [
"TAG",
"dist/index.html",
"dist/tf-tensorboard.html",
"//tensorflow/tensorboard/bower",
"//tensorflow/tensorboard/lib:all_files",
],
)
@@ -21,7 +21,7 @@ directory by creating a `SummaryWriter`:
``` python
# sess.graph contains the graph definition; that enables the Graph Visualizer.
summary_writer = tf.train.SummaryWriter('/path/to/logs', sess.graph)
```
For more details, see [this
@@ -115,9 +115,9 @@ For example, here is a well-organized TensorBoard log directory, with two runs,
# The Visualizations
### Events Dashboard
TensorBoard's Events Dashboard visualizes scalar statistics that vary over time;
for example, you might want to track the model's loss or learning rate. As
described in *Key Concepts*, you can compare multiple runs, and the data is
organized by tag. The line charts have the following interactions:
@@ -49,10 +49,11 @@
# to run.
#
# Constants:
# Fixed naming patterns for wheel (.whl) files given different python versions
if [[ $(uname) == "Linux" ]]; then
declare -A WHL_TAGS
WHL_TAGS=(["2.7"]="cp27-none" ["3.4"]="cp34-cp34m" ["3.5"]="cp35-cp35m")
fi
INSTALL_EXTRA_PIP_PACKAGES=${TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES}
@@ -243,6 +243,8 @@ rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/jpeg
cp -r tensorflow/core/lib/jpeg ${PY_TEST_DIR}/tensorflow/core/lib
rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/png
cp -r tensorflow/core/lib/png ${PY_TEST_DIR}/tensorflow/core/lib
rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/gif
cp -r tensorflow/core/lib/gif ${PY_TEST_DIR}/tensorflow/core/lib
# Copy test data from tensorflow/contrib/ffmpeg
@@ -174,24 +174,57 @@ function get_cuda_capability_version() {
fi
}
# Container type, e.g., CPU, GPU
CTYPE=${TF_BUILD_CONTAINER_TYPE}
# Determine if Docker is available
OPT_FLAG="" OPT_FLAG=""
if [[ -z "$(which docker)" ]]; then
DO_DOCKER=0
echo "It appears that Docker is not available on this system. "\
"Will perform build without Docker."
echo "Also, the additional option flags will be applied to the build:"
echo " ${NO_DOCKER_OPT_FLAG}"
MAIN_CMD="${NO_DOCKER_MAIN_CMD} ${CTYPE}"
OPT_FLAG="${OPT_FLAG} ${NO_DOCKER_OPT_FLAG}"
fi
# Process container type
if [[ ${CTYPE} == "cpu" ]]; then if [[ ${CTYPE} == "cpu" ]]; then
: :
elif [[ ${CTYPE} == "gpu" ]]; then elif [[ ${CTYPE} == "gpu" ]]; then
OPT_FLAG="--config=cuda" OPT_FLAG="${OPT_FLAG} --config=cuda"
# Attempt to determine CUDA capability version and use it # Attempt to determine CUDA capability version automatically and use it if
if [[ "${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS}" != \ # CUDA capability version is not specified by the environment variables.
*"TF_CUDA_COMPUTE_CAPABILITIES="* ]]; then CUDA_CAPA_VER=$(get_cuda_capability_version)
if [[ ! -z ${CUDA_CAPA_VER} ]]; then
AUTO_CUDA_CAPA_VER=0
if [[ ${DO_DOCKER} == "1" ]] && \
[[ "${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS}" != \
*"TF_CUDA_COMPUTE_CAPABILITIES="* ]]; then
AUTO_CUDA_CAPA_VER=1
TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS=\
"${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS} -e "\ "${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS} -e "\
"TF_CUDA_COMPUTE_CAPABILITIES=${CUDA_CAPA_VER}" "TF_CUDA_COMPUTE_CAPABILITIES=${CUDA_CAPA_VER}"
echo "Docker GPU build: TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS="\
"\"${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS}\""
elif [[ ${DO_DOCKER} == "0" ]] && \
[[ -z "${TF_CUDA_COMPUTE_CAPABILITIES}" ]]; then
AUTO_CUDA_CAPA_VER=1
TF_CUDA_COMPUTE_CAPABILITIES="${CUDA_CAPA_VER}"
echo "Non-Docker GPU build: TF_CUDA_COMPUTE_CAPABILITIES="\
"\"${TF_CUDA_COMPUTE_CAPABILITIES}\""
fi
if [[ ${AUTO_CUDA_CAPA_VER} == "1" ]]; then
echo "TF_CUDA_COMPUTE_CAPABILITIES is not set:"
echo "Using CUDA capability version from deviceQuery: ${CUDA_CAPA_VER}"
echo ""
fi
fi
elif [[ ${CTYPE} == "android" ]]; then elif [[ ${CTYPE} == "android" ]]; then
@@ -203,19 +236,6 @@ fi
EXTRA_PARAMS=""
# Determine if Docker is available
if [[ -z "$(which docker)" ]]; then
DO_DOCKER=0
echo "It appears that Docker is not available on this system. "\
"Will perform build without Docker."
echo "Also, the additional option flags will be applied to the build:"
echo " ${NO_DOCKER_OPT_FLAG}"
MAIN_CMD="${NO_DOCKER_MAIN_CMD} ${CTYPE}"
OPT_FLAG="${OPT_FLAG} ${NO_DOCKER_OPT_FLAG}"
fi
# Determine if this is a benchmarks job
RUN_BENCHMARKS=0
if [[ ! -z "${TF_BUILD_RUN_BENCHMARKS}" ]] &&
@@ -80,7 +80,7 @@ RUN mkdir /bazel && \
# Download and build TensorFlow.
RUN git clone -b r0.9 --recursive --recurse-submodules https://github.com/tensorflow/tensorflow.git && \
cd tensorflow && \
git checkout r0.9
WORKDIR /tensorflow
@@ -16,7 +16,9 @@ RUN ./install_google_cloud_sdk.bash --disable-prompts --install-dir=/var/gcloud
# Install nightly TensorFlow pip
RUN pip install \
http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-cp27-none-linux_x86_64.whl
# Copy test files
RUN mkdir -p /gcs-smoke/python
COPY gcs_smoke_wrapper.sh /gcs-smoke/
COPY python/gcs_smoke.py /gcs-smoke/python/
@@ -67,30 +67,8 @@ docker build --no-cache \
# Run the docker image with the GCS key file mapped and the gcloud-required
# environment variables set.
LOG_FILE="/tmp/tf-gcs-test.log"
rm -rf ${LOG_FILE}
docker run --rm \
-v ${GCLOUD_JSON_KEY_PATH}:/gcloud-key.json \
-e "GOOGLE_APPLICATION_CREDENTIALS=/gcloud-key.json" \
"${DOCKER_IMG}" \
/gcs-smoke/gcs_smoke_wrapper.sh "${GCS_BUCKET_URL}"
2>&1 > "${LOG_FILE}"
if [[ $? != "0" ]]; then
cat ${LOG_FILE}
die "FAIL: End-to-end test of GCS access from TensorFlow failed."
fi
cat ${LOG_FILE}
echo ""
# Clean up the newly created tfrecord file in GCS bucket
NEW_TFREC_URL=$(grep "Using input path" "${LOG_FILE}" | \
awk '{print $NF}')
if [[ -z ${NEW_TFREC_URL} ]]; then
die "FAIL: Unable to determine the URL to the new tfrecord file in GCS"
fi
gsutil rm "${NEW_TFREC_URL}" && \
echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}" || \
die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
@@ -0,0 +1,98 @@
#!/usr/bin/env bash
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# In-container wrapper for GCS smoke test.
#
# This script invokes gcs_smoke.py and performs tear down afterwards.
#
# Usage:
# gcs_smoke_wrapper.sh <GCS_BUCKET_URL>
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Helper function: Exit on failure.
die () {
echo $@
exit 1
}
print_usage() {
echo "Usage: gcs_smoke_wrapper.sh <GCS_BUCKET_URL>"
echo ""
}
# Sanity check on command-line arguments.
GCS_BUCKET_URL=$1
if [[ -z "${GCS_BUCKET_URL}" ]]; then
print_usage
die "ERROR: Command-line argument GCS_BUCKET_URL is not supplied"
fi
# Check that gcloud and gsutil binaries are available.
GCLOUD_BIN="/var/gcloud/google-cloud-sdk/bin/gcloud"
if [[ ! -f "${GCLOUD_BIN}" ]]; then
die "ERROR: Unable to find gcloud at path ${GCLOUD_BIN}"
fi
GSUTIL_BIN="/var/gcloud/google-cloud-sdk/bin/gsutil"
if [[ ! -f "${GSUTIL_BIN}" ]]; then
die "ERROR: Unable to find gsutil at path ${GSUTIL_BIN}"
fi
# Check environment variable for gcloud credentials
if [[ -z "${GOOGLE_APPLICATION_CREDENTIALS}" ]]; then
die "ERROR: Required gcloud environment variable "\
"${GOOGLE_APPLICATION_CREDENTIALS} is not set."
fi
# Locate main Python file
GCS_SMOKE_PY="${SCRIPT_DIR}/python/gcs_smoke.py"
if [[ ! -f "${GCS_SMOKE_PY}" ]]; then
die "ERROR: Unable to find Python file at ${GCS_SMOKE_PY}"
fi
LOG_FILE="/tmp/tf-gcs-test.log"
rm -rf ${LOG_FILE} || \
die "ERROR: Failed to remove existing log file ${LOG_FILE}"
# Invoke main Python file
python "${GCS_SMOKE_PY}" --gcs_bucket_url="${GCS_BUCKET_URL}" \
2>&1 > "${LOG_FILE}"
if [[ $? != "0" ]]; then
cat ${LOG_FILE}
die "FAIL: End-to-end test of GCS access from TensorFlow failed."
fi
cat ${LOG_FILE}
echo ""
# Clean up the newly created tfrecord file in GCS bucket.
# First, activate gcloud service account
"${GCLOUD_BIN}" auth activate-service-account \
--key-file "${GOOGLE_APPLICATION_CREDENTIALS}" || \
die "ERROR: Failed to activate gcloud service account with JSON key file"
NEW_TFREC_URL=$(grep "Using input path" "${LOG_FILE}" | \
awk '{print $NF}')
if [[ -z ${NEW_TFREC_URL} ]]; then
die "FAIL: Unable to determine the URL to the new tfrecord file in GCS"
fi
"${GSUTIL_BIN}" rm "${NEW_TFREC_URL}" && \
echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}" || \
die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
@@ -8,8 +8,8 @@ load("//tensorflow:tensorflow.bzl", "transitive_hdrs")
transitive_hdrs(
name = "other_headers",
deps = [
"//tensorflow/core:protos_all_cc",
"//third_party/eigen3",
],
)
@@ -108,21 +108,16 @@ class InstallHeaders(Command):
# directories for -I
install_dir = re.sub('/google/protobuf/src', '', install_dir)
# Copy eigen code into tensorflow/include.
# A symlink would do, but the wheel file that gets created ignores
# symlink within the directory hierarchy.
# NOTE(keveman): Figure out how to customize bdist_wheel package so
# we can do the symlink.
if 'external/eigen_archive/' in install_dir:
extra_dir = install_dir.replace('external/eigen_archive', '')
if not os.path.exists(extra_dir):
self.mkpath(extra_dir)
self.copy_file(header, extra_dir)
if not os.path.exists(install_dir):
self.mkpath(install_dir)
@@ -4,10 +4,17 @@
# within the workspace (e.g. "tensorflow/"), and tf_repo_name is the name of the
# local_repository rule (e.g. "@tf").
def tf_workspace(path_prefix = "", tf_repo_name = ""):
# These lines need to be changed when updating Eigen. They are parsed from
# this file by the cmake and make builds to determine the eigen version and hash.
eigen_version = "b4fa9622b809"
eigen_sha256 = "2862840c2de9c0473a4ef20f8678949ae89ab25965352ee53329e63ba46cec62"
native.new_http_archive(
name = "eigen_archive",
url = "https://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz",
sha256 = eigen_sha256,
strip_prefix = "eigen-eigen-" + eigen_version,
build_file = path_prefix + "eigen.BUILD",
)
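Since the cmake build scrapes `eigen_version` and `eigen_sha256` out of this file with regexes, a small Python sketch of that parsing convention (the file path is an assumption):

```python
import re

def parse_eigen_pin(workspace_bzl_path="tensorflow/workspace.bzl"):
    # Mirror the regexes the cmake build applies to workspace.bzl.
    text = open(workspace_bzl_path).read()
    version = re.search(r'eigen_version\s*=\s*"([^"]+)"', text).group(1)
    sha256 = re.search(r'eigen_sha256\s*=\s*"([^"]+)"', text).group(1)
    return version, sha256
```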
@@ -56,6 +63,13 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
build_file = path_prefix + "png.BUILD",
)
native.new_http_archive(
name = "gif_archive",
url = "http://ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
sha256 = "34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1",
build_file = path_prefix + "gif.BUILD",
)
native.new_http_archive(
name = "six_archive",
url = "https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz#md5=34eed507548117b2ab523ab14b2f8b55",
@@ -92,8 +106,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
)
native.bind(
name = "python_headers",
actual = tf_repo_name + "//util/python:python_headers",
)
# grpc expects //external:protobuf_clib and //external:protobuf_compiler
@@ -141,9 +155,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
)
native.git_repository(
name = "boringssl_git",
remote = "https://github.com/google/boringssl.git",
commit = "bbcaa15b0647816b9a1a9b9e0d209cd6712f0105",  # 2016-07-11
)
native.new_git_repository(
@@ -1,4 +1,3 @@
package(default_visibility = ["//visibility:public"])
licenses(["notice"])  # Apache 2.0
@@ -13,7 +13,6 @@ cc_library(
"unsupported/Eigen/CXX11/FixedPoint",
"unsupported/Eigen/CXX11/src/FixedPoint/*.h",
]),
includes = ["."],
visibility = ["//visibility:public"],
deps = [
"@eigen_archive//:eigen",
@@ -1 +1 @@
#include "Eigen/Cholesky"
@@ -1 +1 @@
#include "Eigen/Core"
@@ -1 +1 @@
#include "Eigen/Eigenvalues"
@@ -1 +1 @@
#include "Eigen/LU"
@@ -1 +1 @@
#include "Eigen/QR"