Merge changes from github.

Change: 128401884
2016-07-25 13:48:16 -08:00 · 2016-07-25 13:48:16 -08:00 · 21716d8f6e
commit 21716d8f6e
parent ed281973d6
105 changed files with 2576 additions and 1144 deletions
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@ -18,7 +18,10 @@ If installed from binary pip package, provide:
 1. Which pip package you installed.
 2. The output from `python -c "import tensorflow; print(tensorflow.__version__)"`.

-If installed from sources, provide the commit hash:
+If installed from source, provide:
+
+1. The commit hash (`git rev-parse HEAD`)
+2. The output of `bazel version`

 ### Steps to reproduce
 1.
--- a/eigen.BUILD
+++ b/eigen.BUILD
@ -1,9 +1,8 @@
 package(default_visibility = ["//visibility:public"])

-archive_dir = "eigen-eigen-b4fa9622b809"
 cc_library(
    name = "eigen",
-    hdrs = glob([archive_dir+"/**/*.h", archive_dir+"/unsupported/Eigen/*", archive_dir+"/unsupported/Eigen/CXX11/*", archive_dir+"/Eigen/*"]),
-    includes = [ archive_dir ],
+    hdrs = glob(["**/*.h", "unsupported/Eigen/*", "unsupported/Eigen/CXX11/*", "Eigen/*"]),
+    includes = [ '.' ],
    visibility = ["//visibility:public"],
 )
--- a/gif.BUILD
+++ b/gif.BUILD
@ -0,0 +1,23 @@
+# Bazel build rules for giflib.
+#
+# File names (relative to prefix_dir) of the C sources compiled into the
+# "gif" library below.
+SOURCES = [
+    "dgif_lib.c",
+    "egif_lib.c",
+    "gif_font.c",
+    "gif_hash.c",
+    "gifalloc.c",
+    "openbsd-reallocarray.c",
+    "gif_err.c",
+    "quantize.c",
+]
+
+# Directory that holds the sources and gif_lib.h; it is prepended to every
+# entry of SOURCES and exported as an include path.
+prefix_dir = "giflib-5.1.4/lib"
+
+# Publicly visible C library exposing gif_lib.h.
+# NOTE(review): `defines` (unlike `copts`) is also applied to targets that
+# depend on this library, so HAVE_CONFIG_H reaches dependents too -- confirm
+# that this is intended.
+cc_library(
+    name = "gif",
+    srcs = [prefix_dir + "/" + source for source in SOURCES],
+    hdrs = [prefix_dir + "/gif_lib.h"],
+    includes = [prefix_dir],
+    defines = [
+        "HAVE_CONFIG_H",
+    ],
+    visibility = ["//visibility:public"],
+)
--- a/tensorflow/contrib/cmake/external/eigen.cmake
+++ b/tensorflow/contrib/cmake/external/eigen.cmake
@ -7,16 +7,30 @@

 include (ExternalProject)

-set(eigen_archive_hash "b4fa9622b809")
+# Parse the current Eigen version and archive hash from the Bazel configuration
+# (workspace.bzl) so both values are defined in a single place.
+file(STRINGS "${PROJECT_SOURCE_DIR}/../../workspace.bzl" workspace_contents)
+# Use the first line that assigns eigen_version; capture group 1 is the text
+# between the double quotes.
+foreach(line ${workspace_contents})
+    string(REGEX MATCH ".*eigen_version.*=.*\"(.*)\"" has_version "${line}")
+    if(has_version)
+        set(eigen_version "${CMAKE_MATCH_1}")
+        break()
+    endif()
+endforeach()
+# Same scan for the eigen_sha256 assignment.
+foreach(line ${workspace_contents})
+    string(REGEX MATCH ".*eigen_sha256.*=.*\"(.*)\"" has_hash "${line}")
+    if(has_hash)
+        set(eigen_hash "${CMAKE_MATCH_1}")
+        break()
+    endif()
+endforeach()
+# Stop at configure time with a clear message rather than continuing with an
+# empty version or checksum.
+if(NOT eigen_version)
+    message(FATAL_ERROR "Could not find eigen_version in ${PROJECT_SOURCE_DIR}/../../workspace.bzl")
+endif()
+if(NOT eigen_hash)
+    message(FATAL_ERROR "Could not find eigen_sha256 in ${PROJECT_SOURCE_DIR}/../../workspace.bzl")
+endif()

 set(eigen_INCLUDE_DIRS
    ${CMAKE_CURRENT_BINARY_DIR}
    ${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive
-    ${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive/eigen-eigen-${eigen_archive_hash}
    ${tensorflow_source_dir}/third_party/eigen3
 )
-set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
-set(eigen_HASH SHA256=2862840c2de9c0473a4ef20f8678949ae89ab25965352ee53329e63ba46cec62)
+set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_version}.tar.gz)
+set(eigen_HASH SHA256=${eigen_hash})
 set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
 set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)

@ -30,5 +44,5 @@ ExternalProject_Add(eigen
        -DCMAKE_BUILD_TYPE:STRING=Release
        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
        -DCMAKE_INSTALL_PREFIX:STRING=${eigen_INSTALL}
-        -DINCLUDE_INSTALL_DIR:STRING=${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive/eigen-eigen-${eigen_archive_hash}
+        -DINCLUDE_INSTALL_DIR:STRING=${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive
 )
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@ -55,12 +55,8 @@ class KMeansClustering(estimator.Estimator,
               distance_metric=clustering_ops.SQUARED_EUCLIDEAN_DISTANCE,
               random_seed=0,
               use_mini_batch=True,
-               batch_size=128,
-               steps=10,
               kmeans_plus_plus_num_retries=2,
-               continue_training=False,
-               config=None,
-               verbose=1):
+               config=None):
    """Creates a model for running KMeans training and inference.

    Args:
@ -73,25 +69,17 @@ class KMeansClustering(estimator.Estimator,
      random_seed: Python integer. Seed for PRNG used to initialize centers.
      use_mini_batch: If true, use the mini-batch k-means algorithm. Else assume
        full batch.
-      batch_size: See TensorFlowEstimator
-      steps: See TensorFlowEstimator
      kmeans_plus_plus_num_retries: For each point that is sampled during
        kmeans++ initialization, this parameter specifies the number of
        additional points to draw from the current distribution before selecting
        the best. If a negative value is specified, a heuristic is used to
        sample O(log(num_to_sample)) additional points.
-      continue_training: See TensorFlowEstimator
-      config: See TensorFlowEstimator
-      verbose: See TensorFlowEstimator
+      config: See Estimator
    """
    super(KMeansClustering, self).__init__(
        model_dir=model_dir,
        config=config)
-    self.batch_size = batch_size
-    self.steps = steps
    self.kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
-    self.continue_training = continue_training
-    self.verbose = verbose
    self._num_clusters = num_clusters
    self._training_initial_clusters = initial_clusters
    self._training_graph = None
@ -135,11 +123,11 @@ class KMeansClustering(estimator.Estimator,
      return relative_change < self._tolerance
 # pylint: enable=protected-access

-  def fit(self, x, y=None, monitors=None, logdir=None, steps=None,
+  def fit(self, x, y=None, monitors=None, logdir=None, steps=None, batch_size=128,
          relative_tolerance=None):
    """Trains a k-means clustering on x.

-    Note: See TensorFlowEstimator for logic for continuous training and graph
+    Note: See Estimator for logic for continuous training and graph
      construction across multiple calls to fit.

    Args:
@ -151,6 +139,7 @@ class KMeansClustering(estimator.Estimator,
        visualization.
      steps: number of training steps. If not None, overrides the value passed
        in constructor.
+      batch_size: mini-batch size to use. Requires `use_mini_batch=True`.
      relative_tolerance: A relative tolerance of change in the loss between
        iterations.  Stops learning if the loss changes less than this amount.
        Note that this may not work correctly if use_mini_batch=True.
@ -162,7 +151,7 @@ class KMeansClustering(estimator.Estimator,
    if logdir is not None:
      self._model_dir = logdir
    self._data_feeder = data_feeder.setup_train_data_feeder(
-        x, None, self._num_clusters, self.batch_size)
+        x, None, self._num_clusters, batch_size if self._use_mini_batch else None)
    if relative_tolerance is not None:
      if monitors is not None:
        monitors += [self._StopWhenConverged(relative_tolerance)]
@ -173,7 +162,7 @@ class KMeansClustering(estimator.Estimator,
            or (self.steps is not None))
    self._train_model(input_fn=self._data_feeder.input_builder,
                      feed_fn=self._data_feeder.get_feed_dict_fn(),
-                      steps=steps or self.steps,
+                      steps=steps,
                      monitors=monitors,
                      init_feed_fn=self._data_feeder.get_feed_dict_fn())
    return self
--- a/tensorflow/contrib/factorization/python/ops/kmeans_test.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
@ -53,13 +53,14 @@ class KMeansTest(tf.test.TestCase):

    self.kmeans = KMeans(self.num_centers,
                         initial_clusters=kmeans_ops.RANDOM_INIT,
-                         batch_size=self.batch_size,
                         use_mini_batch=self.use_mini_batch,
-                         steps=30,
-                         continue_training=True,
-                         config=run_config.RunConfig(tf_random_seed=14),
+                         config=self.config(14),
                         random_seed=12)

+  @staticmethod
+  def config(tf_random_seed):
+    """Returns a `RunConfig` constructed with the given `tf_random_seed`."""
+    return run_config.RunConfig(tf_random_seed=tf_random_seed)
+
  @property
  def batch_size(self):
    return self.num_points
@ -86,7 +87,7 @@ class KMeansTest(tf.test.TestCase):

  def test_clusters(self):
    kmeans = self.kmeans
-    kmeans.fit(x=self.points, steps=0)
+    kmeans.fit(x=self.points, steps=1, batch_size=8)
    clusters = kmeans.clusters()
    self.assertAllEqual(list(clusters.shape),
                        [self.num_centers, self.num_dims])
@ -97,10 +98,11 @@ class KMeansTest(tf.test.TestCase):
      return
    kmeans = self.kmeans
    kmeans.fit(x=self.points,
-               steps=1)
+               steps=1, batch_size=self.batch_size)
    score1 = kmeans.score(x=self.points)
    kmeans.fit(x=self.points,
-               steps=15 * self.num_points // self.batch_size)
+               steps=15 * self.num_points // self.batch_size,
+               batch_size=self.batch_size)
    score2 = kmeans.score(x=self.points)
    self.assertTrue(score1 > score2)
    self.assertNear(self.true_score, score2, self.true_score * 0.05)
@ -111,39 +113,36 @@ class KMeansTest(tf.test.TestCase):
      return
    kmeans = KMeans(self.num_centers,
                    initial_clusters=kmeans_ops.RANDOM_INIT,
-                    batch_size=self.batch_size,
                    use_mini_batch=self.use_mini_batch,
-                    # Force it to train forever until the monitor stops it.
-                    steps=None,
-                    continue_training=True,
                    config=run_config.RunConfig(tf_random_seed=14),
                    random_seed=12)

    kmeans.fit(x=self.points,
               # Force it to train forever until the monitor stops it.
               steps=None,
+               batch_size=self.batch_size,
               relative_tolerance=1e-4)
    score = kmeans.score(x=self.points)
    self.assertNear(self.true_score, score, self.true_score * 0.005)

  def test_infer(self):
    kmeans = self.kmeans
-    kmeans.fit(x=self.points)
+    kmeans.fit(x=self.points, steps=10, batch_size=128)
    clusters = kmeans.clusters()

    # Make a small test set
    points, true_assignments, true_offsets = self.make_random_points(clusters,
                                                                     10)
    # Test predict
-    assignments = kmeans.predict(points)
+    assignments = kmeans.predict(points, batch_size=self.batch_size)
    self.assertAllEqual(assignments, true_assignments)

    # Test score
-    score = kmeans.score(points)
+    score = kmeans.score(points, batch_size=128)
    self.assertNear(score, np.sum(true_offsets), 0.01 * score)

    # Test transform
-    transform = kmeans.transform(points)
+    transform = kmeans.transform(points, batch_size=128)
    true_transform = np.maximum(
        0,
        np.sum(np.square(points), axis=1, keepdims=True) -
@ -161,12 +160,9 @@ class KMeansTest(tf.test.TestCase):
                    initial_clusters=kmeans_ops.RANDOM_INIT,
                    distance_metric=kmeans_ops.COSINE_DISTANCE,
                    use_mini_batch=self.use_mini_batch,
-                    batch_size=4,
-                    steps=30,
-                    continue_training=True,
-                    config=run_config.RunConfig(tf_random_seed=2),
+                    config=self.config(2),
                    random_seed=12)
-    kmeans.fit(x=points)
+    kmeans.fit(x=points, steps=10, batch_size=4)
    centers = normalize(kmeans.clusters())
    self.assertAllClose(np.sort(centers, axis=0),
                        np.sort(true_centers, axis=0))
@ -184,10 +180,8 @@ class KMeansTest(tf.test.TestCase):
                    initial_clusters=kmeans_ops.RANDOM_INIT,
                    distance_metric=kmeans_ops.COSINE_DISTANCE,
                    use_mini_batch=self.use_mini_batch,
-                    batch_size=8,
-                    continue_training=True,
-                    config=run_config.RunConfig(tf_random_seed=3))
-    kmeans.fit(x=points, steps=30)
+                    config=self.config(3))
+    kmeans.fit(x=points, steps=30, batch_size=8)

    centers = normalize(kmeans.clusters())
    self.assertAllClose(np.sort(centers, axis=0),
@ -195,7 +189,7 @@ class KMeansTest(tf.test.TestCase):
                        atol=1e-2)

    true_transform = 1 - cosine_similarity(points, centers)
-    transform = kmeans.transform(points)
+    transform = kmeans.transform(points, batch_size=8)
    self.assertAllClose(transform, true_transform, atol=1e-3)

  def test_predict_with_cosine_distance(self):
@ -217,20 +211,18 @@ class KMeansTest(tf.test.TestCase):
                    initial_clusters=kmeans_ops.RANDOM_INIT,
                    distance_metric=kmeans_ops.COSINE_DISTANCE,
                    use_mini_batch=self.use_mini_batch,
-                    batch_size=8,
-                    continue_training=True,
-                    config=run_config.RunConfig(tf_random_seed=3))
-    kmeans.fit(x=points, steps=30)
+                    config=self.config(3))
+    kmeans.fit(x=points, steps=30, batch_size=8)

    centers = normalize(kmeans.clusters())
    self.assertAllClose(np.sort(centers, axis=0),
                        np.sort(true_centers, axis=0), atol=1e-2)

-    assignments = kmeans.predict(points)
+    assignments = kmeans.predict(points, batch_size=8)
    self.assertAllClose(centers[assignments],
                        true_centers[true_assignments], atol=1e-2)

-    score = kmeans.score(points)
+    score = kmeans.score(points, batch_size=8)
    self.assertAllClose(score, true_score, atol=1e-2)

  def test_predict_with_cosine_distance_and_kmeans_plus_plus(self):
@ -254,21 +246,19 @@ class KMeansTest(tf.test.TestCase):
                    initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT,
                    distance_metric=kmeans_ops.COSINE_DISTANCE,
                    use_mini_batch=self.use_mini_batch,
-                    batch_size=12,
-                    continue_training=True,
-                    config=run_config.RunConfig(tf_random_seed=3))
-    kmeans.fit(x=points, steps=30)
+                    config=self.config(3))
+    kmeans.fit(x=points, steps=30, batch_size=12)

    centers = normalize(kmeans.clusters())
    self.assertAllClose(sorted(centers.tolist()),
                        sorted(true_centers.tolist()),
                        atol=1e-2)

-    assignments = kmeans.predict(points)
+    assignments = kmeans.predict(points, batch_size=12)
    self.assertAllClose(centers[assignments],
                        true_centers[true_assignments], atol=1e-2)

-    score = kmeans.score(points)
+    score = kmeans.score(points, batch_size=12)
    self.assertAllClose(score, true_score, atol=1e-2)

  def test_fit_raise_if_num_clusters_larger_than_num_points_random_init(self):
@ -276,7 +266,7 @@ class KMeansTest(tf.test.TestCase):

    with self.assertRaisesOpError('less'):
      kmeans = KMeans(num_clusters=3, initial_clusters=kmeans_ops.RANDOM_INIT)
-      kmeans.fit(x=points)
+      kmeans.fit(x=points, steps=10, batch_size=8)

  def test_fit_raise_if_num_clusters_larger_than_num_points_kmeans_plus_plus(
      self):
@ -285,7 +275,7 @@ class KMeansTest(tf.test.TestCase):
    with self.assertRaisesOpError(AssertionError):
      kmeans = KMeans(num_clusters=3,
                      initial_clusters=kmeans_ops.KMEANS_PLUS_PLUS_INIT)
-      kmeans.fit(x=points)
+      kmeans.fit(x=points, steps=10, batch_size=8)


 class MiniBatchKMeansTest(KMeansTest):
--- a/tensorflow/contrib/ios_examples/README.md
+++ b/tensorflow/contrib/ios_examples/README.md
@ -72,5 +72,14 @@ rundown:
   unused because no other code references the variables, but in fact their
   constructors have the important side effect of registering the class.
 
+ - C++11 support (or later) should be enabled by setting `C++ Language Dialect` to
+   `GNU++11` (or `GNU++14`), and `C++ Standard Library` to `libc++`.
+ 
 - The library doesn't currently support bitcode, so you'll need to disable that
   in your project settings.
+
+ - Remove any use of the `-all_load` flag in your project. The protocol buffers
+   libraries (full and lite versions) contain duplicate symbols, and the `-all_load`
+   flag will cause these duplicates to become link errors. If you were using
+   `-all_load` to avoid issues with Objective-C categories in static libraries,
+   you may be able to replace it with the `-ObjC` flag.
--- a/tensorflow/contrib/learn/python/learn/estimators/classifier.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/classifier.py
@ -47,7 +47,9 @@ class Classifier(estimator.Estimator):
    Args:
      model_fn: (targets, predictions, mode) -> logits, loss, train_op
      n_classes: Number of classes
-      model_dir: Base directory for output data
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
      config: Configuration object (optional)
    """
    self._n_classes = n_classes
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@ -119,7 +119,9 @@ class DNNClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
      feature_columns: An iterable containing all the feature columns used by
        the model. All items in the set should be instances of classes derived
        from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
      n_classes: number of target classes. Default is binary classification.
        It must be greater than 1.
      weight_column_name: A string defining feature column name representing
@ -277,7 +279,9 @@ class DNNRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
      feature_columns: An iterable containing all the feature columns used by
        the model. All items in the set should be instances of classes derived
        from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
      weight_column_name: A string defining feature column name representing
        weights. It is used to down weight or boost examples during training. It
        will be multiplied by the loss of the example.
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
@ -72,7 +72,9 @@ class _DNNLinearCombinedBaseEstimator(estimator.BaseEstimator):

    Args:
      target_column: A _TargetColumn object.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
      linear_feature_columns: An iterable containing all the feature columns
        used by linear part of the model. All items in the set should be
        instances of classes derived from `FeatureColumn`.
@ -354,7 +356,9 @@ class DNNLinearCombinedClassifier(_DNNLinearCombinedBaseEstimator):
    """Constructs a DNNLinearCombinedClassifier instance.

    Args:
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
      n_classes: number of target classes. Default is binary classification.
      weight_column_name: A string defining feature column name representing
        weights. It is used to down weight or boost examples during training.
@ -537,7 +541,9 @@ class DNNLinearCombinedRegressor(_DNNLinearCombinedBaseEstimator):
    """Initializes a DNNLinearCombinedRegressor instance.

    Args:
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
      weight_column_name: A string defining feature column name representing
        weights. It is used to down weight or boost examples during training. It
        will be multiplied by the loss of the example.
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@ -158,7 +158,9 @@ class BaseEstimator(sklearn.BaseEstimator):
    """Initializes a BaseEstimator instance.

    Args:
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
      config: A RunConfig instance.
    """
    # Model directory.
@ -766,7 +768,9 @@ class Estimator(BaseEstimator):
                 is passed to Estimator in `params` parameter. This allows
                 to configure Estimators from hyper parameter tunning.

-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
      config: Configuration object.
      params: `dict` of hyper parameters that will be passed into `model_fn`.
              Keys are names of parameters, values are basic python types.
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@ -122,7 +122,9 @@ class LinearClassifier(dnn_linear_combined.DNNLinearCombinedClassifier):
      feature_columns: An iterable containing all the feature columns used by
        the model. All items in the set should be instances of classes derived
        from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
      n_classes: number of target classes. Default is binary classification.
      weight_column_name: A string defining feature column name representing
        weights. It is used to down weight or boost examples during training. It
@ -280,7 +282,9 @@ class LinearRegressor(dnn_linear_combined.DNNLinearCombinedRegressor):
      feature_columns: An iterable containing all the feature columns used by
        the model. All items in the set should be instances of classes derived
        from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph, etc.
+      model_dir: Directory to save model parameters, graph, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
      weight_column_name: A string defining feature column name representing
        weights. It is used to down weight or boost examples during training. It
        will be multiplied by the loss of the example.
--- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py
@ -57,7 +57,9 @@ class LogisticRegressor(estimator.Estimator):
        expects the returned predictions to be probabilities in [0.0, 1.0].
      thresholds: List of floating point thresholds to use for accuracy,
        precision, and recall metrics. If None, defaults to [0.5].
-      model_dir: Directory to save model parameters, graphs, etc.
+      model_dir: Directory to save model parameters, graphs, etc. This can also
+        be used to load checkpoints from the directory into an estimator to
+        continue training a previously saved model.
      config: A RunConfig configuration object.
    """
    if thresholds is None:
--- a/tensorflow/contrib/learn/python/learn/estimators/random_forest.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/random_forest.py
@ -69,8 +69,7 @@ class TensorForestEstimator(estimator.BaseEstimator):
  def __init__(self, params, device_assigner=None, model_dir=None,
               graph_builder_class=tensor_forest.RandomForestGraphs,
               master='', accuracy_metric=None,
-               tf_random_seed=None, continue_training=False, verbose=1,
-               max_to_keep=5, save_checkpoint_secs=300):
+               tf_random_seed=None, config=None):
    self.params = params.fill()
    self.accuracy_metric = (accuracy_metric or
                            ('r2' if self.params.regression else 'accuracy'))
@ -81,12 +80,6 @@ class TensorForestEstimator(estimator.BaseEstimator):
    self.training_args = {}
    self.construction_args = {}

-    config = run_config.RunConfig(
-        master=master,
-        tf_random_seed=(tf_random_seed or int((time.time() * 1000) % 1000)),
-        save_checkpoints_secs=save_checkpoint_secs,
-        keep_checkpoint_max=max_to_keep)
-
    super(TensorForestEstimator, self).__init__(model_dir=model_dir,
                                                config=config)

--- a/tensorflow/contrib/learn/python/learn/estimators/svm.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/svm.py
@ -74,7 +74,9 @@ class SVM(linear.LinearClassifier):
    weight_column_name: A string defining feature column name representing
      weights. It is used to down weight or boost examples during training. It
      will be multiplied by the loss of the example.
-    model_dir: Directory to save model parameters, graph and etc.
+    model_dir: Directory to save model parameters, graph, etc. This can also
+      be used to load checkpoints from the directory into an estimator to
+      continue training a previously saved model.
    l1_regularization: L1-regularization parameter
    l2_regularization: L2-regularization parameter
    kernels: A list of kernels for the SVM. Currently, no kernels are supported.
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@ -38,29 +38,29 @@ HOST_OBJDIR := $(MAKEFILE_DIR)/gen/host_obj/
 HOST_BINDIR := $(MAKEFILE_DIR)/gen/host_bin/
 HOST_GENDIR := $(MAKEFILE_DIR)/gen/host_obj/

-# Find the current Eigen version name from the Bazel build file
-EIGEN_HASH := $(shell cat eigen.BUILD | grep archive_dir | head -1 | cut -f3 -d- | cut -f1 -d\")
+# Find the current Eigen version from the Bazel configuration
+EIGEN_VERSION := $(shell grep eigen_version tensorflow/workspace.bzl | head -1 | sed -e 's/.*eigen_version.*=.*"\(.*\)"/\1/')

 # Settings for the host compiler.
 HOST_CXX := $(CC_PREFIX) gcc
 HOST_CXXFLAGS := --std=c++11
-HOST_LDOPTS := \
-L/usr/local/lib
-
+HOST_LDOPTS := 
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	HOST_LDOPTS += -L$(MAKEFILE_DIR)/gen/protobuf-host/lib
 endif
+HOST_LDOPTS += -L/usr/local/lib

 HOST_INCLUDES := \
-I/usr/local/include \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
-I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_HASH) \
+-I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_VERSION) \
 -I$(HOST_GENDIR)
-
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include
 endif
+# This is at the end so any globally-installed frameworks like protobuf don't
+# override local versions in the source tree.
+HOST_INCLUDES += -I/usr/local/include

 HOST_LIBS := \
 -lstdc++ \
@ -120,21 +120,18 @@ CXXFLAGS := --std=c++11 -DIS_SLIM_BUILD $(OPTFLAGS)
 LDFLAGS := \
 -L/usr/local/lib

-ifeq ($(HAS_GEN_HOST_PROTOC),true)
-	HOST_LDOPTS += -L$(MAKEFILE_DIR)/gen/protobuf-host/lib
-endif
-
 INCLUDES := \
-I/usr/local/include \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
-I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_HASH) \
+-I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_VERSION) \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
-
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include
 endif
+# This is at the end so any globally-installed frameworks like protobuf don't
+# override local versions in the source tree.
+INCLUDES += -I/usr/local/include

 LIBS := \
 -lstdc++ \
@ -211,7 +208,7 @@ ifeq ($(TARGET),ANDROID)
 -I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi/include \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
-I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_HASH) \
+-I$(MAKEFILE_DIR)/downloads/eigen-eigen-$(EIGEN_VERSION) \
 -I$(MAKEFILE_DIR)/gen/protobuf/include \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
@ -364,7 +361,52 @@ BENCHMARK_NAME := $(BINDIR)benchmark

 # What sources we want to compile, derived from the main Bazel build using the
 # gen_file_lists.sh script.
-TF_CC_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_cc_files.txt)
+
+# Candidate core sources: every .cc file matched by globbing the
+# tensorflow/core directories listed below. The patterns are relative paths,
+# so they only match when make is run from the root of the TensorFlow source
+# tree.
+CORE_CC_ALL_SRCS := \
+$(wildcard tensorflow/core/*.cc) \
+$(wildcard tensorflow/core/common_runtime/*.cc) \
+$(wildcard tensorflow/core/debug/*.cc) \
+$(wildcard tensorflow/core/framework/*.cc) \
+$(wildcard tensorflow/core/graph/*.cc) \
+$(wildcard tensorflow/core/lib/*/*.cc) \
+$(wildcard tensorflow/core/platform/*.cc) \
+$(wildcard tensorflow/core/platform/*/*.cc) \
+$(wildcard tensorflow/core/util/*.cc) \
+$(wildcard tensorflow/core/util/*/*.cc)
+# Files to drop from the candidate list: anything matching *test.cc,
+# *testutil*, *testlib* or *main.cc one or two directory levels below
+# tensorflow/core, plus the individual files and directories named below.
+CORE_CC_EXCLUDE_SRCS := \
+$(wildcard tensorflow/core/*/*test.cc) \
+$(wildcard tensorflow/core/*/*testutil*) \
+$(wildcard tensorflow/core/*/*testlib*) \
+$(wildcard tensorflow/core/*/*main.cc) \
+$(wildcard tensorflow/core/*/*/*test.cc) \
+$(wildcard tensorflow/core/*/*/*testutil*) \
+$(wildcard tensorflow/core/*/*/*testlib*) \
+$(wildcard tensorflow/core/*/*/*main.cc) \
+$(wildcard tensorflow/core/graph/dot.*) \
+$(wildcard tensorflow/core/lib/gif/*) \
+$(wildcard tensorflow/core/lib/jpeg/*) \
+$(wildcard tensorflow/core/lib/png/*) \
+$(wildcard tensorflow/core/util/checkpoint_reader.*) \
+$(wildcard tensorflow/core/util/events_writer.*) \
+$(wildcard tensorflow/core/util/reporter.*) \
+$(wildcard tensorflow/core/util/tf_status_helper.*) \
+$(wildcard tensorflow/core/platform/default/stream_executor.*) \
+$(wildcard tensorflow/core/platform/default/test_benchmark.*) \
+$(wildcard tensorflow/core/platform/cuda.h) \
+$(wildcard tensorflow/core/platform/cloud/*) \
+$(wildcard tensorflow/core/platform/google/*) \
+$(wildcard tensorflow/core/platform/jpeg.*) \
+$(wildcard tensorflow/core/platform/png.*) \
+$(wildcard tensorflow/core/platform/stream_executor.*) \
+$(wildcard tensorflow/core/user_ops/*.cu.cc) \
+$(wildcard tensorflow/core/common_runtime/gpu/*) \
+$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*)
+# Filter out all the excluded files.
+TF_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
+# Add in any extra files that don't fit the patterns easily
+# (gpu_tracer.cc lives under common_runtime/gpu/, which none of the wildcard
+# patterns in CORE_CC_ALL_SRCS collect, so it has to be listed explicitly).
+TF_CC_SRCS += tensorflow/core/common_runtime/gpu/gpu_tracer.cc
+# Also include the op and kernel definitions, read from tf_op_files.txt.
+TF_CC_SRCS += $(shell cat $(MAKEFILE_DIR)/tf_op_files.txt)
 PBT_CC_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_pb_text_files.txt)
 PROTO_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_proto_files.txt)
 BENCHMARK_SRCS := \
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@ -16,15 +16,15 @@ This static library will not contain:
 - Python or other language bindings
 - GPU support
 
- You can target:
- - iOS
- - OS X (macOS)
- - Android
- - Raspberry-PI
+You can target:
+- iOS
+- OS X (macOS)
+- Android
+- Raspberry-PI
 
- You will compile tensorflow and protobuf libraries that you can link into other
- applications.  You will also compile the [benchmark](../../tools/benchmark/)
- application that will let you check your application.
+You will compile tensorflow and protobuf libraries that you can link into other
+applications.  You will also compile the [benchmark](../../tools/benchmark/)
+application that will let you check your application.
 
 ## Before you start (all platforms)

@ -176,15 +176,16 @@ curl -o ~/graphs/inception.zip \

 ### Building all at once

-If you just want to get the libraries compiled in a hurry, you can run:
+If you just want to get the libraries compiled in a hurry, you can run this
+from the root of your TensorFlow source folder:

 ```bash
-build_all_ios.sh
+tensorflow/contrib/makefile/build_all_ios.sh
 ```

-and wait a long time.
+This process will take around twenty minutes on a modern MacBook Pro.

-When this completes, you will have a library for a single architecture and the
+When it completes, you will have a library for a single architecture and the
 benchmark program. Although successfully compiling the benchmark program is a
 sign of success, the program is not a complete iOS app.

@ -284,6 +285,17 @@ make -f tensorflow/contrib/makefile/Makefile HOST_OS=PI TARGET=PI \
 OPTFLAGS="-Os -mfpu=neon-vfpv4 -funsafe-math-optimizations -ftree-vectorize"
 ```

+If you hit compilation errors mentioning `__atomic_compare_exchange` and you're
+using gcc 4.9, you should try installing gcc 4.8 and using that instead:
+
+```bash
+sudo apt-get install -y gcc-4.8 g++-4.8
+make -f tensorflow/contrib/makefile/Makefile HOST_OS=PI TARGET=PI \
+OPTFLAGS="-Os -mfpu=neon-vfpv4 -funsafe-math-optimizations -ftree-vectorize" \
+CXX=g++-4.8
+```
+
+
 # Other notes

 ## Supported Systems
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@ -1,4 +1,4 @@
-#!/bin/bash -x
+#!/bin/bash -ex
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@ -15,11 +15,22 @@
 # ==============================================================================

 DOWNLOADS_DIR=tensorflow/contrib/makefile/downloads
+BZL_FILE_PATH=tensorflow/workspace.bzl

-mkdir ${DOWNLOADS_DIR}
+mkdir -p ${DOWNLOADS_DIR}

 # Grab the current Eigen version name from the Bazel build file
-EIGEN_HASH=$(cat eigen.BUILD | grep archive_dir | head -1 | cut -f3 -d- | cut -f1 -d\")
+# Select the line(s) of ${BZL_FILE_PATH} in which eigen_version is followed by
+# '=' and a double-quoted string, then print the third whitespace-separated
+# field. For a line of the form `eigen_version = "<hash>"` that field is the
+# quoted hash.
+EIGEN_HASH=$(cat "${BZL_FILE_PATH}" | egrep "eigen_version.*=.*\".*\"" | awk '{ print $3 }')
+# Trim trailing and preceding double quotes
+EIGEN_HASH="${EIGEN_HASH%\"}"
+EIGEN_HASH="${EIGEN_HASH#\"}"
+
+# Stop with an error on stderr when nothing was extracted; otherwise report
+# the hash that the download below will use.
+if [[ -z "${EIGEN_HASH}" ]]; then
+    echo >&2 "Eigen hash does not exist."
+    exit 1
+else
+    echo "Eigen hash = ${EIGEN_HASH}"
+fi

 curl "https://bitbucket.org/eigen/eigen/get/${EIGEN_HASH}.tar.gz" \
 -o /tmp/eigen-${EIGEN_HASH}.tar.gz
@ -34,3 +45,5 @@ git clone https://github.com/google/protobuf.git ${DOWNLOADS_DIR}/protobuf
 cd ${DOWNLOADS_DIR}
 rm -rf eigen-latest
 ln -s eigen-eigen-${EIGEN_HASH} eigen-latest
+
+echo "download_dependencies.sh completed successfully."
--- a/tensorflow/contrib/makefile/gen_file_lists.sh
+++ b/tensorflow/contrib/makefile/gen_file_lists.sh
@ -16,16 +16,6 @@
 # This script generates the source file lists needed by the makefile by querying
 # the master Bazel build configuration.

-bazel query 'kind("source file", deps(//tensorflow/core:android_tensorflow_lib))' | \
-grep "//tensorflow/.*\.cc$" | \
-grep -v "gen_proto_text" | \
-grep -E -v "jpeg" | \
-grep -E -v "png" | \
-grep -E -v "zlib" | \
-sed -E 's#^//##g' | \
-sed -E 's#:#/#g' \
-> tensorflow/contrib/makefile/tf_cc_files.txt
-
 bazel query 'kind("source file", deps(//tensorflow/core:android_tensorflow_lib))' | \
 grep "//tensorflow/.*\.proto$" | \
 sed -E 's#^//##g' | \
--- a/tensorflow/contrib/makefile/tf_cc_files.txt
+++ b/tensorflow/contrib/makefile/tf_cc_files.txt
@ -1,264 +0,0 @@
-tensorflow/core/kernels/xent_op.cc
-tensorflow/core/kernels/where_op.cc
-tensorflow/core/kernels/variable_ops.cc
-tensorflow/core/kernels/unpack_op.cc
-tensorflow/core/kernels/transpose_op.cc
-tensorflow/core/kernels/transpose_functor_cpu.cc
-tensorflow/core/kernels/training_ops.cc
-tensorflow/core/kernels/topk_op.cc
-tensorflow/core/kernels/tile_ops.cc
-tensorflow/core/kernels/strided_slice_op.cc
-tensorflow/core/kernels/stack_ops.cc
-tensorflow/core/kernels/split_op.cc
-tensorflow/core/kernels/split_lib_cpu.cc
-tensorflow/core/kernels/sparse_to_dense_op.cc
-tensorflow/core/kernels/softsign_op.cc
-tensorflow/core/kernels/softplus_op.cc
-tensorflow/core/kernels/softmax_op.cc
-tensorflow/core/kernels/slice_op.cc
-tensorflow/core/kernels/shape_ops.cc
-tensorflow/core/kernels/session_ops.cc
-tensorflow/core/kernels/sequence_ops.cc
-tensorflow/core/kernels/sendrecv_ops.cc
-tensorflow/core/kernels/save_restore_tensor.cc
-tensorflow/core/kernels/save_op.cc
-tensorflow/core/kernels/reverse_sequence_op.cc
-tensorflow/core/kernels/reverse_op.cc
-tensorflow/core/kernels/restore_op.cc
-tensorflow/core/kernels/resize_nearest_neighbor_op.cc
-tensorflow/core/kernels/resize_bilinear_op.cc
-tensorflow/core/kernels/reshape_op.cc
-tensorflow/core/kernels/relu_op.cc
-tensorflow/core/kernels/reduction_ops_sum.cc
-tensorflow/core/kernels/reduction_ops_prod.cc
-tensorflow/core/kernels/reduction_ops_min.cc
-tensorflow/core/kernels/reduction_ops_mean.cc
-tensorflow/core/kernels/reduction_ops_max.cc
-tensorflow/core/kernels/reduction_ops_common.cc
-tensorflow/core/kernels/pooling_ops_common.cc
-tensorflow/core/kernels/pad_op.cc
-tensorflow/core/kernels/pack_op.cc
-tensorflow/core/kernels/ops_util.cc
-tensorflow/core/kernels/no_op.cc
-tensorflow/core/kernels/maxpooling_op.cc
-tensorflow/core/kernels/matmul_op.cc
-tensorflow/core/kernels/lrn_op.cc
-tensorflow/core/kernels/in_topk_op.cc
-tensorflow/core/kernels/immutable_constant_op.cc
-tensorflow/core/kernels/identity_op.cc
-tensorflow/core/kernels/gather_op.cc
-tensorflow/core/kernels/fill_functor.cc
-tensorflow/core/kernels/example_parsing_ops.cc
-tensorflow/core/kernels/dynamic_stitch_op.cc
-tensorflow/core/kernels/dynamic_partition_op.cc
-tensorflow/core/kernels/dense_update_ops.cc
-tensorflow/core/kernels/cwise_ops_common.cc
-tensorflow/core/kernels/cwise_op_tanh.cc
-tensorflow/core/kernels/cwise_op_sub.cc
-tensorflow/core/kernels/cwise_op_squared_difference.cc
-tensorflow/core/kernels/cwise_op_square.cc
-tensorflow/core/kernels/cwise_op_sqrt.cc
-tensorflow/core/kernels/cwise_op_sigmoid.cc
-tensorflow/core/kernels/cwise_op_select.cc
-tensorflow/core/kernels/cwise_op_rsqrt.cc
-tensorflow/core/kernels/cwise_op_neg.cc
-tensorflow/core/kernels/cwise_op_mul.cc
-tensorflow/core/kernels/cwise_op_minimum.cc
-tensorflow/core/kernels/cwise_op_maximum.cc
-tensorflow/core/kernels/cwise_op_log.cc
-tensorflow/core/kernels/cwise_op_less.cc
-tensorflow/core/kernels/cwise_op_isfinite.cc
-tensorflow/core/kernels/cwise_op_inverse.cc
-tensorflow/core/kernels/cwise_op_greater.cc
-tensorflow/core/kernels/cwise_op_exp.cc
-tensorflow/core/kernels/cwise_op_equal_to.cc
-tensorflow/core/kernels/cwise_op_div.cc
-tensorflow/core/kernels/cwise_op_add.cc
-tensorflow/core/kernels/ctc_decoder_ops.cc
-tensorflow/core/kernels/conv_ops.cc
-tensorflow/core/kernels/conv_grad_ops.cc
-tensorflow/core/kernels/control_flow_ops.cc
-tensorflow/core/kernels/constant_op.cc
-tensorflow/core/kernels/concat_op.cc
-tensorflow/core/kernels/concat_lib_cpu.cc
-tensorflow/core/kernels/check_numerics_op.cc
-tensorflow/core/kernels/cast_op.cc
-tensorflow/core/kernels/bias_op.cc
-tensorflow/core/kernels/bcast_ops.cc
-tensorflow/core/kernels/batch_norm_op.cc
-tensorflow/core/kernels/avgpooling_op.cc
-tensorflow/core/kernels/argmax_op.cc
-tensorflow/core/kernels/aggregate_ops.cc
-tensorflow/core/util/work_sharder.cc
-tensorflow/core/util/util.cc
-tensorflow/core/util/use_cudnn.cc
-tensorflow/core/util/tensor_slice_writer.cc
-tensorflow/core/util/tensor_slice_set.cc
-tensorflow/core/util/tensor_slice_reader_cache.cc
-tensorflow/core/util/tensor_slice_reader.cc
-tensorflow/core/util/tensor_format.cc
-tensorflow/core/util/stat_summarizer.cc
-tensorflow/core/util/sparse/group_iterator.cc
-tensorflow/core/util/saved_tensor_slice_util.cc
-tensorflow/core/util/port.cc
-tensorflow/core/util/padding.cc
-tensorflow/core/util/mirror_pad_mode.cc
-tensorflow/core/util/memmapped_file_system_writer.cc
-tensorflow/core/util/memmapped_file_system.cc
-tensorflow/core/util/guarded_philox_random.cc
-tensorflow/core/util/example_proto_helper.cc
-tensorflow/core/util/device_name_utils.cc
-tensorflow/core/util/command_line_flags.cc
-tensorflow/core/util/bcast.cc
-tensorflow/core/platform/tracing.cc
-tensorflow/core/platform/tensor_coding.cc
-tensorflow/core/platform/protobuf_util.cc
-tensorflow/core/platform/posix/posix_file_system.cc
-tensorflow/core/platform/posix/port.cc
-tensorflow/core/platform/posix/env.cc
-tensorflow/core/platform/load_library.cc
-tensorflow/core/platform/file_system.cc
-tensorflow/core/platform/env.cc
-tensorflow/core/platform/denormal.cc
-tensorflow/core/platform/default/tracing.cc
-tensorflow/core/platform/default/logging.cc
-tensorflow/core/ops/training_ops.cc
-tensorflow/core/ops/string_ops.cc
-tensorflow/core/ops/state_ops.cc
-tensorflow/core/ops/sparse_ops.cc
-tensorflow/core/ops/sendrecv_ops.cc
-tensorflow/core/ops/script_ops.cc
-tensorflow/core/ops/random_ops.cc
-tensorflow/core/ops/random_grad.cc
-tensorflow/core/ops/parsing_ops.cc
-tensorflow/core/ops/no_op.cc
-tensorflow/core/ops/nn_ops.cc
-tensorflow/core/ops/nn_grad.cc
-tensorflow/core/ops/math_ops.cc
-tensorflow/core/ops/math_grad.cc
-tensorflow/core/ops/logging_ops.cc
-tensorflow/core/ops/linalg_ops.cc
-tensorflow/core/ops/io_ops.cc
-tensorflow/core/ops/image_ops.cc
-tensorflow/core/ops/functional_ops.cc
-tensorflow/core/ops/functional_grad.cc
-tensorflow/core/ops/function_ops.cc
-tensorflow/core/ops/data_flow_ops.cc
-tensorflow/core/ops/ctc_ops.cc
-tensorflow/core/ops/control_flow_ops.cc
-tensorflow/core/ops/candidate_sampling_ops.cc
-tensorflow/core/ops/array_ops.cc
-tensorflow/core/ops/array_grad.cc
-tensorflow/core/lib/wav/wav_io.cc
-tensorflow/core/lib/strings/stringprintf.cc
-tensorflow/core/lib/strings/strcat.cc
-tensorflow/core/lib/strings/str_util.cc
-tensorflow/core/lib/strings/scanner.cc
-tensorflow/core/lib/strings/proto_text_util.cc
-tensorflow/core/lib/strings/ordered_code.cc
-tensorflow/core/lib/strings/numbers.cc
-tensorflow/core/lib/random/weighted_picker.cc
-tensorflow/core/lib/random/simple_philox.cc
-tensorflow/core/lib/random/random.cc
-tensorflow/core/lib/random/distribution_sampler.cc
-tensorflow/core/lib/io/two_level_iterator.cc
-tensorflow/core/lib/io/table_builder.cc
-tensorflow/core/lib/io/table.cc
-tensorflow/core/lib/io/record_writer.cc
-tensorflow/core/lib/io/record_reader.cc
-tensorflow/core/lib/io/path.cc
-tensorflow/core/lib/io/match.cc
-tensorflow/core/lib/io/iterator.cc
-tensorflow/core/lib/io/inputbuffer.cc
-tensorflow/core/lib/io/format.cc
-tensorflow/core/lib/io/block_builder.cc
-tensorflow/core/lib/io/block.cc
-tensorflow/core/lib/histogram/histogram.cc
-tensorflow/core/lib/hash/hash.cc
-tensorflow/core/lib/hash/crc32c.cc
-tensorflow/core/lib/core/threadpool.cc
-tensorflow/core/lib/core/stringpiece.cc
-tensorflow/core/lib/core/status.cc
-tensorflow/core/lib/core/coding.cc
-tensorflow/core/lib/core/arena.cc
-tensorflow/core/graph/validate.cc
-tensorflow/core/graph/tensor_id.cc
-tensorflow/core/graph/subgraph.cc
-tensorflow/core/graph/quantize_training.cc
-tensorflow/core/graph/optimizer_cse.cc
-tensorflow/core/graph/node_builder.cc
-tensorflow/core/graph/graph_partition.cc
-tensorflow/core/graph/graph_def_builder.cc
-tensorflow/core/graph/graph_constructor.cc
-tensorflow/core/graph/graph.cc
-tensorflow/core/graph/gradients.cc
-tensorflow/core/graph/equal_graph_def.cc
-tensorflow/core/graph/edgeset.cc
-tensorflow/core/graph/costmodel.cc
-tensorflow/core/graph/colors.cc
-tensorflow/core/graph/algorithm.cc
-tensorflow/core/framework/versions.cc
-tensorflow/core/framework/unique_tensor_references.cc
-tensorflow/core/framework/types.cc
-tensorflow/core/framework/tracking_allocator.cc
-tensorflow/core/framework/tensor_util.cc
-tensorflow/core/framework/tensor_slice.cc
-tensorflow/core/framework/tensor_shape.cc
-tensorflow/core/framework/tensor_reference.cc
-tensorflow/core/framework/tensor.cc
-tensorflow/core/framework/shape_inference.cc
-tensorflow/core/framework/resource_mgr.cc
-tensorflow/core/framework/rendezvous.cc
-tensorflow/core/framework/reader_op_kernel.cc
-tensorflow/core/framework/partial_tensor_shape.cc
-tensorflow/core/framework/op_segment.cc
-tensorflow/core/framework/op_kernel.cc
-tensorflow/core/framework/op_gen_lib.cc
-tensorflow/core/framework/op_def_util.cc
-tensorflow/core/framework/op_def_builder.cc
-tensorflow/core/framework/op.cc
-tensorflow/core/framework/node_def_util.cc
-tensorflow/core/framework/node_def_builder.cc
-tensorflow/core/framework/memory_types.cc
-tensorflow/core/framework/lookup_interface.cc
-tensorflow/core/framework/log_memory.cc
-tensorflow/core/framework/load_library.cc
-tensorflow/core/framework/kernel_def_builder.cc
-tensorflow/core/framework/graph_def_util.cc
-tensorflow/core/framework/function.cc
-tensorflow/core/framework/fake_input.cc
-tensorflow/core/framework/device_base.cc
-tensorflow/core/framework/common_shape_fns.cc
-tensorflow/core/framework/cancellation.cc
-tensorflow/core/framework/bfloat16.cc
-tensorflow/core/framework/attr_value_util.cc
-tensorflow/core/framework/allocator.cc
-tensorflow/core/common_runtime/threadpool_device_factory.cc
-tensorflow/core/common_runtime/threadpool_device.cc
-tensorflow/core/common_runtime/step_stats_collector.cc
-tensorflow/core/common_runtime/simple_placer.cc
-tensorflow/core/common_runtime/simple_graph_execution_state.cc
-tensorflow/core/common_runtime/session_state.cc
-tensorflow/core/common_runtime/session_options.cc
-tensorflow/core/common_runtime/session_factory.cc
-tensorflow/core/common_runtime/session.cc
-tensorflow/core/common_runtime/rendezvous_mgr.cc
-tensorflow/core/common_runtime/process_util.cc
-tensorflow/core/common_runtime/memory_types.cc
-tensorflow/core/common_runtime/local_device.cc
-tensorflow/core/common_runtime/graph_optimizer.cc
-tensorflow/core/common_runtime/gpu/gpu_tracer.cc
-tensorflow/core/common_runtime/function.cc
-tensorflow/core/common_runtime/executor.cc
-tensorflow/core/common_runtime/direct_session.cc
-tensorflow/core/common_runtime/device_set.cc
-tensorflow/core/common_runtime/device_mgr.cc
-tensorflow/core/common_runtime/device_factory.cc
-tensorflow/core/common_runtime/device.cc
-tensorflow/core/common_runtime/costmodel_manager.cc
-tensorflow/core/common_runtime/copy_tensor.cc
-tensorflow/core/common_runtime/constant_folding.cc
-tensorflow/core/common_runtime/build_graph_options.cc
-tensorflow/core/common_runtime/bfc_allocator.cc
-tensorflow/core/common_runtime/allocator_retry.cc
-tensorflow/core/client/tensor_c_api.cc
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@ -0,0 +1,124 @@
+tensorflow/core/kernels/xent_op.cc
+tensorflow/core/kernels/where_op.cc
+tensorflow/core/kernels/variable_ops.cc
+tensorflow/core/kernels/unpack_op.cc
+tensorflow/core/kernels/transpose_op.cc
+tensorflow/core/kernels/transpose_functor_cpu.cc
+tensorflow/core/kernels/training_ops.cc
+tensorflow/core/kernels/topk_op.cc
+tensorflow/core/kernels/tile_ops.cc
+tensorflow/core/kernels/strided_slice_op_inst_6.cc
+tensorflow/core/kernels/strided_slice_op_inst_5.cc
+tensorflow/core/kernels/strided_slice_op_inst_4.cc
+tensorflow/core/kernels/strided_slice_op_inst_3.cc
+tensorflow/core/kernels/strided_slice_op_inst_2.cc
+tensorflow/core/kernels/strided_slice_op_inst_1.cc
+tensorflow/core/kernels/strided_slice_op.cc
+tensorflow/core/kernels/stack_ops.cc
+tensorflow/core/kernels/split_op.cc
+tensorflow/core/kernels/split_lib_cpu.cc
+tensorflow/core/kernels/sparse_to_dense_op.cc
+tensorflow/core/kernels/softsign_op.cc
+tensorflow/core/kernels/softplus_op.cc
+tensorflow/core/kernels/softmax_op.cc
+tensorflow/core/kernels/slice_op.cc
+tensorflow/core/kernels/shape_ops.cc
+tensorflow/core/kernels/session_ops.cc
+tensorflow/core/kernels/sequence_ops.cc
+tensorflow/core/kernels/sendrecv_ops.cc
+tensorflow/core/kernels/save_restore_tensor.cc
+tensorflow/core/kernels/save_op.cc
+tensorflow/core/kernels/reverse_sequence_op.cc
+tensorflow/core/kernels/reverse_op.cc
+tensorflow/core/kernels/restore_op.cc
+tensorflow/core/kernels/resize_nearest_neighbor_op.cc
+tensorflow/core/kernels/resize_bilinear_op.cc
+tensorflow/core/kernels/reshape_op.cc
+tensorflow/core/kernels/relu_op.cc
+tensorflow/core/kernels/reduction_ops_sum.cc
+tensorflow/core/kernels/reduction_ops_prod.cc
+tensorflow/core/kernels/reduction_ops_min.cc
+tensorflow/core/kernels/reduction_ops_mean.cc
+tensorflow/core/kernels/reduction_ops_max.cc
+tensorflow/core/kernels/reduction_ops_common.cc
+tensorflow/core/kernels/pooling_ops_common.cc
+tensorflow/core/kernels/pad_op.cc
+tensorflow/core/kernels/pack_op.cc
+tensorflow/core/kernels/ops_util.cc
+tensorflow/core/kernels/no_op.cc
+tensorflow/core/kernels/maxpooling_op.cc
+tensorflow/core/kernels/matmul_op.cc
+tensorflow/core/kernels/lrn_op.cc
+tensorflow/core/kernels/in_topk_op.cc
+tensorflow/core/kernels/immutable_constant_op.cc
+tensorflow/core/kernels/identity_op.cc
+tensorflow/core/kernels/gather_op.cc
+tensorflow/core/kernels/fill_functor.cc
+tensorflow/core/kernels/example_parsing_ops.cc
+tensorflow/core/kernels/dynamic_stitch_op.cc
+tensorflow/core/kernels/dynamic_partition_op.cc
+tensorflow/core/kernels/dense_update_ops.cc
+tensorflow/core/kernels/cwise_ops_common.cc
+tensorflow/core/kernels/cwise_op_tanh.cc
+tensorflow/core/kernels/cwise_op_sub.cc
+tensorflow/core/kernels/cwise_op_squared_difference.cc
+tensorflow/core/kernels/cwise_op_square.cc
+tensorflow/core/kernels/cwise_op_sqrt.cc
+tensorflow/core/kernels/cwise_op_sigmoid.cc
+tensorflow/core/kernels/cwise_op_select.cc
+tensorflow/core/kernels/cwise_op_rsqrt.cc
+tensorflow/core/kernels/cwise_op_neg.cc
+tensorflow/core/kernels/cwise_op_mul.cc
+tensorflow/core/kernels/cwise_op_minimum.cc
+tensorflow/core/kernels/cwise_op_maximum.cc
+tensorflow/core/kernels/cwise_op_log.cc
+tensorflow/core/kernels/cwise_op_less.cc
+tensorflow/core/kernels/cwise_op_isfinite.cc
+tensorflow/core/kernels/cwise_op_inverse.cc
+tensorflow/core/kernels/cwise_op_greater.cc
+tensorflow/core/kernels/cwise_op_exp.cc
+tensorflow/core/kernels/cwise_op_equal_to.cc
+tensorflow/core/kernels/cwise_op_div.cc
+tensorflow/core/kernels/cwise_op_add.cc
+tensorflow/core/kernels/ctc_decoder_ops.cc
+tensorflow/core/kernels/conv_ops.cc
+tensorflow/core/kernels/conv_grad_ops.cc
+tensorflow/core/kernels/control_flow_ops.cc
+tensorflow/core/kernels/constant_op.cc
+tensorflow/core/kernels/concat_op.cc
+tensorflow/core/kernels/concat_lib_cpu.cc
+tensorflow/core/kernels/check_numerics_op.cc
+tensorflow/core/kernels/cast_op.cc
+tensorflow/core/kernels/bias_op.cc
+tensorflow/core/kernels/bcast_ops.cc
+tensorflow/core/kernels/batch_norm_op.cc
+tensorflow/core/kernels/avgpooling_op.cc
+tensorflow/core/kernels/argmax_op.cc
+tensorflow/core/kernels/aggregate_ops.cc
+tensorflow/core/ops/training_ops.cc
+tensorflow/core/ops/string_ops.cc
+tensorflow/core/ops/state_ops.cc
+tensorflow/core/ops/sparse_ops.cc
+tensorflow/core/ops/sendrecv_ops.cc
+tensorflow/core/ops/script_ops.cc
+tensorflow/core/ops/random_ops.cc
+tensorflow/core/ops/random_grad.cc
+tensorflow/core/ops/parsing_ops.cc
+tensorflow/core/ops/no_op.cc
+tensorflow/core/ops/nn_ops.cc
+tensorflow/core/ops/nn_grad.cc
+tensorflow/core/ops/math_ops.cc
+tensorflow/core/ops/math_grad.cc
+tensorflow/core/ops/logging_ops.cc
+tensorflow/core/ops/linalg_ops.cc
+tensorflow/core/ops/io_ops.cc
+tensorflow/core/ops/image_ops.cc
+tensorflow/core/ops/functional_ops.cc
+tensorflow/core/ops/functional_grad.cc
+tensorflow/core/ops/function_ops.cc
+tensorflow/core/ops/data_flow_ops.cc
+tensorflow/core/ops/ctc_ops.cc
+tensorflow/core/ops/control_flow_ops.cc
+tensorflow/core/ops/candidate_sampling_ops.cc
+tensorflow/core/ops/array_ops.cc
+tensorflow/core/ops/array_grad.cc
--- a/tensorflow/contrib/quantization/BUILD
+++ b/tensorflow/contrib/quantization/BUILD
@ -69,6 +69,8 @@ py_library(
    srcs_version = "PY2AND3",
    deps = [
        ":ops",
+        "//tensorflow/contrib/quantization:quantized_ops_py",
+        "//tensorflow/contrib/quantization/kernels:quantized_kernels_py",
    ],
 )

--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@ -603,6 +603,7 @@ filegroup(
            "graph/dot.*",
            "lib/jpeg/**/*",
            "lib/png/**/*",
+            "lib/gif/**/*",
            "util/checkpoint_reader.*",
            "util/events_writer.*",
            "util/reporter.*",
@ -613,6 +614,7 @@ filegroup(
            "platform/google/**/*",
            "platform/jpeg.*",
            "platform/png.*",
+            "platform/gif.*",
            "platform/stream_executor.*",
            "user_ops/**/*.cu.cc",
            "common_runtime/gpu/**/*",
@ -843,6 +845,7 @@ cc_library(
    hdrs = [
        "lib/core/blocking_counter.h",
        "lib/core/refcount.h",
+        "lib/gif/gif_io.h",
        "lib/gtl/edit_distance.h",
        "lib/gtl/int_type.h",
        "lib/gtl/iterator_range.h",
@ -1967,6 +1970,10 @@ filegroup(
        "lib/jpeg/testdata/corrupt34_3.jpg",
        # -- hand-edited variant: stops after a restart marker
        "lib/jpeg/testdata/corrupt34_4.jpg",
+        # GIF data
+        "lib/gif/testdata/scan.gif",
+        # GIF data with optimization
+        "lib/gif/testdata/optimized.gif",
    ],
 )

--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@ -859,6 +859,7 @@ tf_kernel_libraries(
        "crop_and_resize_op",
        "decode_jpeg_op",
        "decode_png_op",
+        "decode_gif_op",
        "draw_bounding_box_op",
        "encode_jpeg_op",
        "attention_ops",
@ -1108,6 +1109,7 @@ tf_kernel_libraries(
        "matmul_op",
        "reduction_ops",
        "segment_reduction_ops",
+        "scan_ops",
        "sequence_ops",
        "sparse_matmul_op",
    ],
@ -2040,6 +2042,7 @@ filegroup(
            "decode_png_op.*",
            "encode_jpeg_op.*",
            "decode_jpeg_op.*",
+            "decode_gif_op.*",
            "identity_reader_op.*",
            "reader_base.*",
            "fixed_length_record_reader_op.*",
--- a/tensorflow/core/kernels/colorspace_op.cc
+++ b/tensorflow/core/kernels/colorspace_op.cc
@ -36,7 +36,7 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;

-template <typename Device>
+template <typename Device, typename T>
 class RGBToHSVOp : public OpKernel {
 public:
  explicit RGBToHSVOp(OpKernelConstruction* context) : OpKernel(context) {}
@ -59,23 +59,23 @@ class RGBToHSVOp : public OpKernel {

    // Make a canonical image, maintaining the last (channel) dimension, while
     // flattening all others to give the functor easy to work with data.
-    TTypes<float, 2>::ConstTensor input_data = input.flat_inner_dims<float>();
-    TTypes<float, 2>::Tensor output_data = output->flat_inner_dims<float>();
+    typename TTypes<T, 2>::ConstTensor input_data = input.flat_inner_dims<T>();
+    typename TTypes<T, 2>::Tensor output_data = output->flat_inner_dims<T>();

    Tensor trange;
    OP_REQUIRES_OK(
-        context, context->allocate_temp(DataTypeToEnum<float>::value,
+        context, context->allocate_temp(DataTypeToEnum<T>::value,
                                        TensorShape({input_data.dimension(0)}),
                                        &trange));

-    TTypes<float, 1>::Tensor range = trange.tensor<float, 1>();
+    typename TTypes<T, 1>::Tensor range = trange.tensor<T, 1>();

-    functor::RGBToHSV<Device>()(context->eigen_device<Device>(), input_data,
-                                range, output_data);
+    functor::RGBToHSV<Device, T>()(context->eigen_device<Device>(), input_data,
+                                   range, output_data);
  }
 };

-template <typename Device>
+template <typename Device, typename T>
 class HSVToRGBOp : public OpKernel {
 public:
  explicit HSVToRGBOp(OpKernelConstruction* context) : OpKernel(context) {}
@ -96,41 +96,54 @@ class HSVToRGBOp : public OpKernel {
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, input.shape(), &output));

-    TTypes<float, 2>::ConstTensor input_data = input.flat_inner_dims<float>();
-    TTypes<float, 2>::Tensor output_data = output->flat_inner_dims<float>();
+    typename TTypes<T, 2>::ConstTensor input_data = input.flat_inner_dims<T>();
+    typename TTypes<T, 2>::Tensor output_data = output->flat_inner_dims<T>();

-    functor::HSVToRGB<Device>()(context->eigen_device<Device>(), input_data,
-                                output_data);
+    functor::HSVToRGB<Device, T>()(context->eigen_device<Device>(), input_data,
+                                   output_data);
  }
 };

-REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU),
-                        RGBToHSVOp<CPUDevice>);
-template class RGBToHSVOp<CPUDevice>;
-REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU),
-                        HSVToRGBOp<CPUDevice>);
-template class HSVToRGBOp<CPUDevice>;
+// REGISTER_CPU(T): registers the CPU kernels for the "RGBToHSV" and "HSVToRGB"
+// ops with the "T" attribute constrained to type T, and explicitly
+// instantiates the corresponding op class templates for CPUDevice.
+// Expanded below for float and double.
+#define REGISTER_CPU(T)                                       \
+  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \
+                              .TypeConstraint<T>("T"),        \
+                          RGBToHSVOp<CPUDevice, T>);          \
+  template class RGBToHSVOp<CPUDevice, T>;                    \
+  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \
+                              .TypeConstraint<T>("T"),        \
+                          HSVToRGBOp<CPUDevice, T>);          \
+  template class HSVToRGBOp<CPUDevice, T>;
+TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);

 #if GOOGLE_CUDA
 // Forward declarations of the function specializations for GPU (to prevent
 // building the GPU versions here, they will be built compiling _gpu.cu.cc).
 namespace functor {
-template <>
-void RGBToHSV<GPUDevice>::operator()(const GPUDevice& d,
-                                     TTypes<float, 2>::ConstTensor input_data,
-                                     TTypes<float, 1>::Tensor range,
-                                     TTypes<float, 2>::Tensor output_data);
-extern template struct RGBToHSV<GPUDevice>;
-template <>
-void HSVToRGB<GPUDevice>::operator()(const GPUDevice& d,
-                                     TTypes<float, 2>::ConstTensor input_data,
-                                     TTypes<float, 2>::Tensor output_data);
-extern template struct HSVToRGB<GPUDevice>;
+// DECLARE_GPU(T): for element type T, declares (without defining) the
+// GPUDevice specializations of RGBToHSV::operator() and HSVToRGB::operator(),
+// and marks both functor structs as extern template so this translation unit
+// does not instantiate them. Expanded below for float and double.
+#define DECLARE_GPU(T)                                        \
+  template <>                                                 \
+  void RGBToHSV<GPUDevice, T>::operator()(const GPUDevice& d, \
+      TTypes<T, 2>::ConstTensor input_data,                   \
+      TTypes<T, 1>::Tensor range,                             \
+      TTypes<T, 2>::Tensor output_data);                      \
+  extern template struct RGBToHSV<GPUDevice, T>;              \
+  template <>                                                 \
+  void HSVToRGB<GPUDevice, T>::operator()(const GPUDevice& d, \
+      TTypes<T, 2>::ConstTensor input_data,                   \
+      TTypes<T, 2>::Tensor output_data);                      \
+  extern template struct HSVToRGB<GPUDevice, T>;
+TF_CALL_float(DECLARE_GPU);
+TF_CALL_double(DECLARE_GPU);
 }  // namespace functor
-REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU),
-                        RGBToHSVOp<GPUDevice>);
-REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU),
-                        HSVToRGBOp<GPUDevice>);
+// REGISTER_GPU(T): registers the GPU kernels for the "RGBToHSV" and "HSVToRGB"
+// ops with the "T" attribute constrained to type T. Expanded below for float
+// and double.
+#define REGISTER_GPU(T)                                       \
+  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \
+                              .TypeConstraint<T>("T"),        \
+                          RGBToHSVOp<GPUDevice, T>);          \
+  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \
+                              .TypeConstraint<T>("T"),        \
+                          HSVToRGBOp<GPUDevice, T>);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
 #endif

 }  // namespace tensorflow
--- a/tensorflow/core/kernels/colorspace_op.h
+++ b/tensorflow/core/kernels/colorspace_op.h
@ -24,18 +24,19 @@ namespace tensorflow {

 namespace functor {

-template <typename Device>
+template <typename Device, typename T>
 struct RGBToHSV {
-  void operator()(const Device &d, TTypes<float, 2>::ConstTensor input_data,
-                  TTypes<float, 1>::Tensor range,
-                  TTypes<float, 2>::Tensor output_data) {
-    auto H = output_data.chip<1>(0);
-    auto S = output_data.chip<1>(1);
-    auto V = output_data.chip<1>(2);
+  void operator()(const Device &d,
+                  typename TTypes<T, 2>::ConstTensor input_data,
+                  typename TTypes<T, 1>::Tensor range,
+                  typename TTypes<T, 2>::Tensor output_data) {
+    auto H = output_data.template chip<1>(0);
+    auto S = output_data.template chip<1>(1);
+    auto V = output_data.template chip<1>(2);

-    auto R = input_data.chip<1>(0);
-    auto G = input_data.chip<1>(1);
-    auto B = input_data.chip<1>(2);
+    auto R = input_data.template chip<1>(0);
+    auto G = input_data.template chip<1>(1);
+    auto B = input_data.template chip<1>(2);

 #if !defined(EIGEN_HAS_INDEX_LIST)
    Eigen::array<int, 1> channel_axis{{1}};
@ -47,38 +48,40 @@ struct RGBToHSV {

    range.device(d) = V - input_data.minimum(channel_axis);

-    S.device(d) = (V > 0.f).select(range / V, V.constant(0.f));
+    S.device(d) = (V > T(0)).select(range / V, V.constant(T(0)));

-    auto norm = range.inverse() * (1.f / 6.f);
+    auto norm = range.inverse() * (T(1) / T(6));
    // TODO(wicke): all these assignments are only necessary because a combined
    // expression is larger than kernel parameter space. A custom kernel is
    // probably in order.
    H.device(d) = (R == V).select(norm * (G - B),
-                                  (G == V).select(norm * (B - R) + 2.f / 6.f,
-                                                  norm * (R - G) + 4.f / 6.f));
-    H.device(d) = (range > 0.f).select(H, H.constant(0.f));
-    H.device(d) = (H < 0.f).select(H + 1.f, H);
+                                  (G == V).select(
+                                      norm * (B - R) + T(2) / T(6),
+                                      norm * (R - G) + T(4) / T(6)));
+    H.device(d) = (range > T(0)).select(H, H.constant(T(0)));
+    H.device(d) = (H < T(0)).select(H + T(1), H);
  }
 };

-template <typename Device>
+template <typename Device, typename T>
 struct HSVToRGB {
-  void operator()(const Device &d, TTypes<float, 2>::ConstTensor input_data,
-                  TTypes<float, 2>::Tensor output_data) {
-    auto H = input_data.chip<1>(0);
-    auto S = input_data.chip<1>(1);
-    auto V = input_data.chip<1>(2);
+  void operator()(const Device &d,
+                  typename TTypes<T, 2>::ConstTensor input_data,
+                  typename TTypes<T, 2>::Tensor output_data) {
+    auto H = input_data.template chip<1>(0);
+    auto S = input_data.template chip<1>(1);
+    auto V = input_data.template chip<1>(2);

    // TODO(wicke): compute only the fractional part of H for robustness
-    auto dh = H * 6.f;
-    auto dr = ((dh - 3.f).abs() - 1.f).cwiseMax(0.f).cwiseMin(1.f);
-    auto dg = (-(dh - 2.f).abs() + 2.f).cwiseMax(0.f).cwiseMin(1.f);
-    auto db = (-(dh - 4.f).abs() + 2.f).cwiseMax(0.f).cwiseMin(1.f);
-    auto one_s = -S + 1.f;
+    auto dh = H * T(6);
+    auto dr = ((dh - T(3)).abs() - T(1)).cwiseMax(T(0)).cwiseMin(T(1));
+    auto dg = (-(dh - T(2)).abs() + T(2)).cwiseMax(T(0)).cwiseMin(T(1));
+    auto db = (-(dh - T(4)).abs() + T(2)).cwiseMax(T(0)).cwiseMin(T(1));
+    auto one_s = -S + T(1);

-    auto R = output_data.chip<1>(0);
-    auto G = output_data.chip<1>(1);
-    auto B = output_data.chip<1>(2);
+    auto R = output_data.template chip<1>(0);
+    auto G = output_data.template chip<1>(1);
+    auto B = output_data.template chip<1>(2);

    R.device(d) = (one_s + S * dr) * V;
    G.device(d) = (one_s + S * dg) * V;
--- a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
@ -24,8 +24,11 @@ namespace tensorflow {

 typedef Eigen::GpuDevice GPUDevice;

-template class functor::RGBToHSV<GPUDevice>;
-template class functor::HSVToRGB<GPUDevice>;
+// Explicitly instantiate both colorspace functors on the GPU device for each
+// element type listed below.
+#define INSTANTIATE_GPU(T)                        \
+  template class functor::RGBToHSV<GPUDevice, T>; \
+  template class functor::HSVToRGB<GPUDevice, T>;
+TF_CALL_float(INSTANTIATE_GPU);
+TF_CALL_double(INSTANTIATE_GPU);
+// Do not leak the helper macro past its last use.
+#undef INSTANTIATE_GPU
 }

 #endif  // GOOGLE_CUDA
--- a/tensorflow/core/kernels/colorspace_op_test.cc
+++ b/tensorflow/core/kernels/colorspace_op_test.cc
@ -29,183 +29,241 @@ limitations under the License.

 namespace tensorflow {

+template <typename T>
 class RGBToHSVOpTest : public OpsTestBase {
 protected:
-  RGBToHSVOpTest() {
+  void MakeOp(DataType data_type) {
    TF_EXPECT_OK(NodeDefBuilder("rgb_to_hsv_op", "RGBToHSV")
-                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(data_type))
                     .Finalize(node_def()));
    TF_EXPECT_OK(InitOp());
  }
+
+  void CheckBlack(DataType data_type) {
+    // Black pixel should map to hsv = [0,0,0]
+    AddInputFromArray<T>(TensorShape({3}), {0, 0, 0});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {0.0, 0.0, 0.0});
+    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
+  }
+
+  void CheckGray(DataType data_type) {
+    // Gray pixel should have hue = saturation = 0.0, value = r
+    AddInputFromArray<T>(TensorShape({3}), {.5, .5, .5});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {0.0, 0.0, .5});
+    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
+  }
+
+  void CheckWhite(DataType data_type) {
+    // White pixel should have hue = saturation = 0.0, value = 1.0
+    AddInputFromArray<T>(TensorShape({3}), {1, 1, 1});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {0.0, 0.0, 1.0});
+    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
+  }
+
+  void CheckRedMax(DataType data_type) {
+    // Test case where red channel dominates
+    AddInputFromArray<T>(TensorShape({3}), {.8, .4, .2});
+    TF_ASSERT_OK(RunOpKernel());
+
+    T expected_h = 1. / 6. * .2 / .6;
+    T expected_s = .6 / .8;
+    T expected_v = .8 / 1.;
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {expected_h, expected_s, expected_v});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
+
+  void CheckGreenMax(DataType data_type) {
+    // Test case where green channel dominates
+    AddInputFromArray<T>(TensorShape({3}), {.2, .8, .4});
+    TF_ASSERT_OK(RunOpKernel());
+
+    T expected_h = 1. / 6. * (2.0 + (.2 / .6));
+    T expected_s = .6 / .8;
+    T expected_v = .8 / 1.;
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {expected_h, expected_s, expected_v});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
+
+  void CheckBlueMax(DataType data_type) {
+    // Test case where blue channel dominates
+    AddInputFromArray<T>(TensorShape({3}), {.4, .2, .8});
+    TF_ASSERT_OK(RunOpKernel());
+
+    T expected_h = 1. / 6. * (4.0 + (.2 / .6));
+    T expected_s = .6 / .8;
+    T expected_v = .8 / 1.;
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {expected_h, expected_s, expected_v});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
+
+  void CheckNegativeDifference(DataType data_type) {
+    AddInputFromArray<T>(TensorShape({3}), {0, .1, .2});
+    TF_ASSERT_OK(RunOpKernel());
+
+    T expected_h = 1. / 6. * (4.0 + (-.1 / .2));
+    T expected_s = .2 / .2;
+    T expected_v = .2 / 1.;
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {expected_h, expected_s, expected_v});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
 };

-TEST_F(RGBToHSVOpTest, CheckBlack) {
-  // Black pixel should map to hsv = [0,0,0]
-  AddInputFromArray<float>(TensorShape({3}), {0, 0, 0});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {0.0, 0.0, 0.0});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
-}
-
-TEST_F(RGBToHSVOpTest, CheckGray) {
-  // Gray pixel should have hue = saturation = 0.0, value = r/255
-  AddInputFromArray<float>(TensorShape({3}), {.5, .5, .5});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {0.0, 0.0, .5});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
-}
-
-TEST_F(RGBToHSVOpTest, CheckWhite) {
-  // Gray pixel should have hue = saturation = 0.0, value = 1.0
-  AddInputFromArray<float>(TensorShape({3}), {1, 1, 1});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {0.0, 0.0, 1.0});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
-}
-
-TEST_F(RGBToHSVOpTest, CheckRedMax) {
-  // Test case where red channel dominates
-  AddInputFromArray<float>(TensorShape({3}), {.8, .4, .2});
-  TF_ASSERT_OK(RunOpKernel());
-
-  float expected_h = 1. / 6. * .2 / .6;
-  float expected_s = .6 / .8;
-  float expected_v = .8 / 1.;
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {expected_h, expected_s, expected_v});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
-
-TEST_F(RGBToHSVOpTest, CheckGreenMax) {
-  // Test case where green channel dominates
-  AddInputFromArray<float>(TensorShape({3}), {.2, .8, .4});
-  TF_ASSERT_OK(RunOpKernel());
-
-  float expected_h = 1. / 6. * (2.0 + (.2 / .6));
-  float expected_s = .6 / .8;
-  float expected_v = .8 / 1.;
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {expected_h, expected_s, expected_v});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
-
-TEST_F(RGBToHSVOpTest, CheckBlueMax) {
-  // Test case where blue channel dominates
-  AddInputFromArray<float>(TensorShape({3}), {.4, .2, .8});
-  TF_ASSERT_OK(RunOpKernel());
-
-  float expected_h = 1. / 6. * (4.0 + (.2 / .6));
-  float expected_s = .6 / .8;
-  float expected_v = .8 / 1.;
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {expected_h, expected_s, expected_v});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
-
-TEST_F(RGBToHSVOpTest, CheckNegativeDifference) {
-  AddInputFromArray<float>(TensorShape({3}), {0, .1, .2});
-  TF_ASSERT_OK(RunOpKernel());
-
-  float expected_h = 1. / 6. * (4.0 + (-.1 / .2));
-  float expected_s = .2 / .2;
-  float expected_v = .2 / 1.;
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {expected_h, expected_s, expected_v});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
-
+template <typename T>
 class HSVToRGBOpTest : public OpsTestBase {
 protected:
-  HSVToRGBOpTest() {
+  void MakeOp(DataType data_type) {
    TF_EXPECT_OK(NodeDefBuilder("hsv_to_rgb_op", "HSVToRGB")
-                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(data_type))
                     .Finalize(node_def()));
    TF_EXPECT_OK(InitOp());
  }
+
+  void CheckBlack(DataType data_type) {
+    // Black pixel should map to rgb = [0,0,0]
+    AddInputFromArray<T>(TensorShape({3}), {0.0, 0.0, 0.0});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {0, 0, 0});
+    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
+  }
+
+  void CheckGray(DataType data_type) {
+    // Gray pixel (hue = saturation = 0.0) should map to r = g = b = value
+    AddInputFromArray<T>(TensorShape({3}), {0.0, 0.0, .5});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {.5, .5, .5});
+    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
+  }
+
+  void CheckWhite(DataType data_type) {
+    // White pixel (hue = saturation = 0.0, value = 1.0) should map to rgb = [1,1,1]
+    AddInputFromArray<T>(TensorShape({3}), {0.0, 0.0, 1.0});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {1, 1, 1});
+    test::ExpectTensorEqual<T>(expected, *GetOutput(0));
+  }
+
+  void CheckRedMax(DataType data_type) {
+    // Test case where red channel dominates
+    T expected_h = 1. / 6. * .2 / .6;
+    T expected_s = .6 / .8;
+    T expected_v = .8 / 1.;
+
+    AddInputFromArray<T>(TensorShape({3}),
+                         {expected_h, expected_s, expected_v});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {.8, .4, .2});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
+
+  void CheckGreenMax(DataType data_type) {
+    // Test case where green channel dominates
+    T expected_h = 1. / 6. * (2.0 + (.2 / .6));
+    T expected_s = .6 / .8;
+    T expected_v = .8 / 1.;
+
+    AddInputFromArray<T>(TensorShape({3}),
+                         {expected_h, expected_s, expected_v});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {.2, .8, .4});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
+
+  void CheckBlueMax(DataType data_type) {
+    // Test case where blue channel dominates
+    T expected_h = 1. / 6. * (4.0 + (.2 / .6));
+    T expected_s = .6 / .8;
+    T expected_v = .8 / 1.0;
+
+    AddInputFromArray<T>(TensorShape({3}),
+                         {expected_h, expected_s, expected_v});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {.4, .2, .8});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
+
+  void CheckNegativeDifference(DataType data_type) {
+    T expected_h = 1. / 6. * (4.0 + (-.1 / .2));
+    T expected_s = .2 / .2;
+    T expected_v = .2 / 1.;
+
+    AddInputFromArray<T>(TensorShape({3}),
+                         {expected_h, expected_s, expected_v});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected(allocator(), data_type, TensorShape({3}));
+    test::FillValues<T>(&expected, {0, .1, .2});
+    test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
+  }
 };

-TEST_F(HSVToRGBOpTest, CheckBlack) {
-  // Black pixel should map to rgb = [0,0,0]
-  AddInputFromArray<float>(TensorShape({3}), {0.0, 0.0, 0.0});
-  TF_ASSERT_OK(RunOpKernel());
+// Defines one TEST_F per Check* helper for the fixture `test`. Each generated
+// test first builds the op with MakeOp(dt) and then runs the matching check,
+// passing the same DataType `dt`.
+#define TEST_COLORSPACE(test, dt)                               \
+  TEST_F(test, CheckBlack) {                                    \
+    MakeOp(dt);                                                 \
+    CheckBlack(dt);                                             \
+  }                                                             \
+  TEST_F(test, CheckGray) {                                     \
+    MakeOp(dt);                                                 \
+    CheckGray(dt);                                              \
+  }                                                             \
+  TEST_F(test, CheckWhite) {                                    \
+    MakeOp(dt);                                                 \
+    CheckWhite(dt);                                             \
+  }                                                             \
+  TEST_F(test, CheckRedMax) {                                   \
+    MakeOp(dt);                                                 \
+    CheckRedMax(dt);                                            \
+  }                                                             \
+  TEST_F(test, CheckGreenMax) {                                 \
+    MakeOp(dt);                                                 \
+    CheckGreenMax(dt);                                          \
+  }                                                             \
+  TEST_F(test, CheckBlueMax) {                                  \
+    MakeOp(dt);                                                 \
+    CheckBlueMax(dt);                                           \
+  }                                                             \
+  TEST_F(test, CheckNegativeDifference) {                       \
+    MakeOp(dt);                                                 \
+    CheckNegativeDifference(dt);                                \
+  }

-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {0, 0, 0});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
-}
+typedef RGBToHSVOpTest<float> rgb_to_hsv_float;
+typedef RGBToHSVOpTest<double> rgb_to_hsv_double;

-TEST_F(HSVToRGBOpTest, CheckGray) {
-  // Gray pixel should have hue = saturation = 0.0, value = r/255
-  AddInputFromArray<float>(TensorShape({3}), {0.0, 0.0, .5});
-  TF_ASSERT_OK(RunOpKernel());
+TEST_COLORSPACE(rgb_to_hsv_float, DT_FLOAT);
+TEST_COLORSPACE(rgb_to_hsv_double, DT_DOUBLE);

-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {.5, .5, .5});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
-}
+typedef HSVToRGBOpTest<float> hsv_to_rgb_float;
+typedef HSVToRGBOpTest<double> hsv_to_rgb_double;

-TEST_F(HSVToRGBOpTest, CheckWhite) {
-  // Gray pixel should have hue = saturation = 0.0, value = 1.0
-  AddInputFromArray<float>(TensorShape({3}), {0.0, 0.0, 1.0});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {1, 1, 1});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
-}
-
-TEST_F(HSVToRGBOpTest, CheckRedMax) {
-  // Test case where red channel dominates
-  float expected_h = 1. / 6. * .2 / .6;
-  float expected_s = .6 / .8;
-  float expected_v = .8 / 1.;
-
-  AddInputFromArray<float>(TensorShape({3}),
-                           {expected_h, expected_s, expected_v});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {.8, .4, .2});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
-
-TEST_F(HSVToRGBOpTest, CheckGreenMax) {
-  // Test case where green channel dominates
-  float expected_h = 1. / 6. * (2.0 + (.2 / .6));
-  float expected_s = .6 / .8;
-  float expected_v = .8 / 1.;
-
-  AddInputFromArray<float>(TensorShape({3}),
-                           {expected_h, expected_s, expected_v});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {.2, .8, .4});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
-
-TEST_F(HSVToRGBOpTest, CheckBlueMax) {
-  // Test case where blue channel dominates
-  float expected_h = 1. / 6. * (4.0 + (.2 / .6));
-  float expected_s = .6 / .8;
-  float expected_v = .8 / 1.0;
-
-  AddInputFromArray<float>(TensorShape({3}),
-                           {expected_h, expected_s, expected_v});
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({3}));
-  test::FillValues<float>(&expected, {.4, .2, .8});
-  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-6);
-}
+TEST_COLORSPACE(hsv_to_rgb_float, DT_FLOAT);
+TEST_COLORSPACE(hsv_to_rgb_double, DT_DOUBLE);
 }  // namespace tensorflow
--- a/tensorflow/core/kernels/cwise_ops_common.h
+++ b/tensorflow/core/kernels/cwise_ops_common.h
@ -65,7 +65,7 @@ class BinaryOpShared : public OpKernel {

 // Coefficient-wise binary operations:
 //   Device: E.g., CPUDevice, GPUDevice.
-//   Functor: defined in cwise_functors.h. E.g., functor::add2.
+//   Functor: defined in cwise_ops.h. E.g., functor::add.
 template <typename Device, typename Functor>
 class BinaryOp : public BinaryOpShared {
 public:
@ -162,7 +162,7 @@ class SimpleBinaryOp : public OpKernel {

 // Coefficient-wise unary operations:
 //   Device: E.g., CPUDevice, GPUDevice.
-//   Functor: defined in cwise_functors.h. E.g., functor::sqrt.
+//   Functor: defined in cwise_ops.h. E.g., functor::sqrt.
 template <typename Device, typename Functor>
 class UnaryOp : public OpKernel {
 public:
--- a/tensorflow/core/kernels/decode_gif_op.cc
+++ b/tensorflow/core/kernels/decode_gif_op.cc
@ -0,0 +1,66 @@
+/* Copyright 2015 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/image_ops.cc
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gif/gif_io.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+// Decode the contents of a GIF file.
+//
+// Input 0 must be a scalar string tensor holding the encoded bytes. Output 0
+// is a uint8 tensor of shape [num_frames, height, width, channels]; it is
+// allocated from inside the callback handed to gif::Decode, i.e. only once
+// the decoder has reported the image dimensions.
+class DecodeGifOp : public OpKernel {
+ public:
+  explicit DecodeGifOp(OpKernelConstruction* context) : OpKernel(context) {}
+  void Compute(OpKernelContext* context) override {
+    const Tensor& contents = context->input(0);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
+                errors::InvalidArgument("contents must be scalar, got shape ",
+                                        contents.shape().DebugString()));
+
+    // Start decoding image to get shape details
+    const StringPiece input = contents.scalar<string>()();
+
+    // Decode image, allocating tensor once the image size is known.
+    // The lambda captures `context` by value (it is a pointer) and `output`
+    // by reference so that allocate_output can fill it in. It returns the
+    // destination buffer, or nullptr after recording the allocation error on
+    // the context.
+    // NOTE(review): presumably gif::Decode yields a null/false result on
+    // failure, which is what triggers the InvalidArgument below -- confirm
+    // against gif_io.h.
+    Tensor* output = nullptr;
+    OP_REQUIRES(
+        context,
+        gif::Decode(input.data(), input.size(),
+                    [=, &output](int num_frames, int width, int height,
+                                 int channels) -> uint8* {
+                      Status status(context->allocate_output(
+                          0, TensorShape({num_frames, height, width, channels}),
+                          &output));
+                      if (!status.ok()) {
+                        VLOG(1) << status;
+                        context->SetStatus(status);
+                        return nullptr;
+                      }
+                      return output->flat<uint8>().data();
+                    }),
+        errors::InvalidArgument("Invalid GIF data, size ", input.size()));
+  }
+};
+// Only a CPU kernel is registered for DecodeGif here.
+REGISTER_KERNEL_BUILDER(Name("DecodeGif").Device(DEVICE_CPU), DecodeGifOp);
+
+}  // namespace tensorflow
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@ -97,13 +97,7 @@ class ReverseOp : public OpKernel {
                              .HostMemory("dims"),    \
                          ReverseOp<CPUDevice, T>)

-TF_CALL_uint8(REGISTER_KERNEL);
-TF_CALL_int8(REGISTER_KERNEL);
-TF_CALL_int32(REGISTER_KERNEL);
-TF_CALL_bool(REGISTER_KERNEL);
-TF_CALL_half(REGISTER_KERNEL);
-TF_CALL_float(REGISTER_KERNEL);
-TF_CALL_double(REGISTER_KERNEL);
+TF_CALL_POD_TYPES(REGISTER_KERNEL);
 #undef REGISTER_KERNEL

 #if GOOGLE_CUDA
@ -136,6 +130,8 @@ TF_CALL_bool(DECLARE_GPU_SPEC);
 TF_CALL_half(DECLARE_GPU_SPEC);
 TF_CALL_float(DECLARE_GPU_SPEC);
 TF_CALL_double(DECLARE_GPU_SPEC);
+TF_CALL_complex64(DECLARE_GPU_SPEC);
+TF_CALL_complex128(DECLARE_GPU_SPEC);
 #undef DECLARE_GPU_SPEC
 #undef DECLARE_GPU_SPEC_DIM
 }  // namespace functor
@ -149,9 +145,15 @@ TF_CALL_double(DECLARE_GPU_SPEC);
                          ReverseOp<GPUDevice, T>)
 TF_CALL_uint8(REGISTER_GPU_KERNEL);
 TF_CALL_int8(REGISTER_GPU_KERNEL);
+// TODO Find out why the int32 GPU kernel doesn't work
+// and decide whether we want to enable the bool kernel.
+//TF_CALL_int32(REGISTER_GPU_KERNEL);
+//TF_CALL_bool(REGISTER_GPU_KERNEL);
 TF_CALL_half(REGISTER_GPU_KERNEL);
 TF_CALL_float(REGISTER_GPU_KERNEL);
 TF_CALL_double(REGISTER_GPU_KERNEL);
+TF_CALL_complex64(REGISTER_GPU_KERNEL);
+TF_CALL_complex128(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL

 #endif  // GOOGLE_CUDA
--- a/tensorflow/core/kernels/reverse_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
@ -25,24 +25,30 @@ namespace tensorflow {

 typedef Eigen::GpuDevice GPUDevice;

-#define DEFINE_REVERSE(DIM)                                      \
-  template struct functor::Reverse<GPUDevice, uint8, DIM>;       \
-  template struct functor::Reverse<GPUDevice, int8, DIM>;        \
-  template struct functor::Reverse<GPUDevice, int32, DIM>;       \
-  template struct functor::Reverse<GPUDevice, bool, DIM>;        \
-  template struct functor::Reverse<GPUDevice, Eigen::half, DIM>; \
-  template struct functor::Reverse<GPUDevice, float, DIM>;       \
-  template struct functor::Reverse<GPUDevice, double, DIM>;
-DEFINE_REVERSE(0)
-DEFINE_REVERSE(1)
-DEFINE_REVERSE(2)
-DEFINE_REVERSE(3)
-DEFINE_REVERSE(4)
-DEFINE_REVERSE(5)
-DEFINE_REVERSE(6)
-DEFINE_REVERSE(7)
-DEFINE_REVERSE(8)
+// Explicitly instantiates functor::Reverse on the GPU device for a single
+// (element type, rank) pair.
+#define DEFINE_REVERSE(T, DIM) \
+  template struct functor::Reverse<GPUDevice, T, DIM>;
+// Instantiates the functor for element type T at every rank from 0 through 8.
+#define DEFINE_REVERSE_ALL_DIMS(T) \
+  DEFINE_REVERSE(T, 0) \
+  DEFINE_REVERSE(T, 1) \
+  DEFINE_REVERSE(T, 2) \
+  DEFINE_REVERSE(T, 3) \
+  DEFINE_REVERSE(T, 4) \
+  DEFINE_REVERSE(T, 5) \
+  DEFINE_REVERSE(T, 6) \
+  DEFINE_REVERSE(T, 7) \
+  DEFINE_REVERSE(T, 8)
+
+// Element types for which the GPU functor is instantiated.
+TF_CALL_uint8(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_int8(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_int32(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_bool(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_half(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_float(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_double(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_complex64(DEFINE_REVERSE_ALL_DIMS);
+TF_CALL_complex128(DEFINE_REVERSE_ALL_DIMS);
 #undef DEFINE_REVERSE
+#undef DEFINE_REVERSE_ALL_DIMS

 }  // namespace tensorflow

--- a/tensorflow/core/kernels/scan_ops.cc
+++ b/tensorflow/core/kernels/scan_ops.cc
@ -0,0 +1,177 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/kernels/scan_ops.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// Kernel that scans input 0 along the axis given by input 1, combining
+// elements with `Reducer`. The output has the same shape as the input.
+//
+// Attributes:
+//   reverse:   forwarded to functor::Scan as its `reverse` argument.
+//   exclusive: forwarded to functor::Scan as its `exclusive` argument.
+//
+// Fails with InvalidArgument when the axis input is not a scalar, when the
+// axis is outside [0, input.dims()), or when the input has more than 8
+// dimensions.
+template <typename Device, class T, typename Reducer>
+class ScanOp : public OpKernel {
+public:
+  explicit ScanOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("reverse", &reverse_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("exclusive", &exclusive_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& input = ctx->input(0);
+    const Tensor& tensor_axis = ctx->input(1);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_axis.shape()),
+                errors::InvalidArgument("ScanOp: axis must be a scalar, not ",
+                                        tensor_axis.shape().DebugString()));
+
+    // Copy the axis out of the tensor once so that the value that is
+    // bounds-checked is the same value that is used afterwards.
+    const int axis = internal::SubtleMustCopy(tensor_axis.scalar<int>()());
+
+    OP_REQUIRES(
+        ctx, FastBoundsCheck(axis, input.dims()),
+        errors::InvalidArgument("ScanOp: Expected scan axis in the range [", 0,
+                                ", ", input.dims(), "), but got ", axis));
+
+    // The scan does not change the shape, so the output mirrors the input.
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
+
+    const Device& d = ctx->eigen_device<Device>();
+    Reducer reducer;
+
+// Dispatches to the functor specialized for the runtime rank of the input.
+#define HANDLE_SCAN(NDIMS)                                                \
+  case NDIMS:                                                             \
+    functor::Scan<Device, Reducer, T, NDIMS>()(                           \
+        d, input.tensor<T, NDIMS>(), output->tensor<T, NDIMS>(), reducer, \
+        axis, reverse_, exclusive_);                                      \
+    return;
+
+    switch (input.dims()) {
+      // input.dims() == 0 can't occur as there
+      // is no valid axis parameter in this case
+      HANDLE_SCAN(1);
+      HANDLE_SCAN(2);
+      HANDLE_SCAN(3);
+      HANDLE_SCAN(4);
+      HANDLE_SCAN(5);
+      HANDLE_SCAN(6);
+      HANDLE_SCAN(7);
+      HANDLE_SCAN(8);
+      default:
+        // The message pieces are concatenated verbatim, so the separator
+        // before the rank has to be part of the literal.
+        OP_REQUIRES(ctx, false,
+                    errors::InvalidArgument(
+                        "Scan does not support tensors with more than 8 "
+                        "dimensions, got ",
+                        input.dims()));
+    }
+#undef HANDLE_SCAN
+  }
+
+private:
+  bool reverse_;    // value of the "reverse" attribute
+  bool exclusive_;  // value of the "exclusive" attribute
+};
+
+// Use `#if` (not `#ifdef`) to match the other GOOGLE_CUDA guards in this file,
+// so that a build defining GOOGLE_CUDA as 0 does not pull in the GPU
+// declarations.
+#if GOOGLE_CUDA
+namespace functor {
+
+// Forward declarations of GPU functors. The `extern template` line keeps this
+// translation unit from instantiating the GPU specializations itself; the
+// explicit instantiations live in scan_ops_gpu.cu.cc.
+#define DECLARE(REDUCER, T, D)                                             \
+  template <>                                                              \
+  void Scan<GPUDevice, REDUCER, T, D>::operator()(                         \
+      const GPUDevice& d, TTypes<T, D>::ConstTensor in,                    \
+      TTypes<T, D>::Tensor out, const REDUCER& reducer,                    \
+      const Eigen::Index& axis, const bool reverse, const bool exclusive); \
+  extern template struct Scan<GPUDevice, REDUCER, T, D>;
+
+// Declares the functor for ranks 1 through 8, the ranks ScanOp dispatches on.
+#define DECLARE_FOR_ALL_DIMS(REDUCER, T) \
+  DECLARE(REDUCER, T, 1);                \
+  DECLARE(REDUCER, T, 2);                \
+  DECLARE(REDUCER, T, 3);                \
+  DECLARE(REDUCER, T, 4);                \
+  DECLARE(REDUCER, T, 5);                \
+  DECLARE(REDUCER, T, 6);                \
+  DECLARE(REDUCER, T, 7);                \
+  DECLARE(REDUCER, T, 8);
+
+// Covers both reducers that have kernels registered below (sum and product).
+#define DECLARE_FOR_ALL_REDUCERS(T)                        \
+  DECLARE_FOR_ALL_DIMS(Eigen::internal::SumReducer<T>, T); \
+  DECLARE_FOR_ALL_DIMS(Eigen::internal::ProdReducer<T>, T);
+
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_ALL_REDUCERS);
+
+#undef DECLARE_FOR_ALL_REDUCERS
+#undef DECLARE_FOR_ALL_DIMS
+#undef DECLARE
+
+}  // namespace functor
+#endif  // GOOGLE_CUDA
+
+
+// Register Cumsum kernels: ScanOp instantiated with Eigen's SumReducer.
+#define REGISTER_CPU_KERNELS(type)                                 \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("Cumsum").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      ScanOp<CPUDevice, type, Eigen::internal::SumReducer<type>>)
+TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+// The GPU kernel keeps "axis" in host memory because ScanOp::Compute reads the
+// axis value directly on the host.
+#define REGISTER_GPU_KERNELS(type)   \
+  REGISTER_KERNEL_BUILDER(           \
+      Name("Cumsum")                 \
+          .Device(DEVICE_GPU)        \
+          .TypeConstraint<type>("T") \
+          .HostMemory("axis"),       \
+      ScanOp<GPUDevice, type, Eigen::internal::SumReducer<type>>)
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS)
+#undef REGISTER_GPU_KERNELS
+#endif // GOOGLE_CUDA
+
+
+// Register Cumprod kernels: ScanOp instantiated with Eigen's ProdReducer.
+#define REGISTER_CPU_KERNELS(type)                                  \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("Cumprod").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      ScanOp<CPUDevice, type, Eigen::internal::ProdReducer<type>>)
+TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+// The GPU kernel keeps "axis" in host memory because ScanOp::Compute reads the
+// axis value directly on the host.
+#define REGISTER_GPU_KERNELS(type)   \
+  REGISTER_KERNEL_BUILDER(           \
+      Name("Cumprod")                \
+          .Device(DEVICE_GPU)        \
+          .TypeConstraint<type>("T") \
+          .HostMemory("axis"),       \
+      ScanOp<GPUDevice, type, Eigen::internal::ProdReducer<type>>)
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS)
+#undef REGISTER_GPU_KERNELS
+#endif // GOOGLE_CUDA
+
+}  // namespace tensorflow
--- a/tensorflow/core/kernels/scan_ops.h
+++ b/tensorflow/core/kernels/scan_ops.h
@ -0,0 +1,47 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_SCAN_OPS_H_
+#define TENSORFLOW_KERNELS_SCAN_OPS_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::Index Index;
+
+// Scans `in` along `axis` with `reducer` and writes the result to `out` on
+// device `d`.
+//
+// When `reverse` is true the input is reversed along `axis` before the scan
+// and the result is reversed back afterwards; otherwise both reverse() calls
+// receive an all-false mask. `exclusive` is forwarded unchanged to Eigen's
+// scan().
+template <typename Device, typename Reducer, typename T, int Dims>
+struct Scan {
+  void operator()(const Device& d, typename TTypes<T, Dims>::ConstTensor in,
+                  typename TTypes<T, Dims>::Tensor out, const Reducer& reducer,
+                  const Index& axis, const bool reverse, const bool exclusive) {
+    // Perform the reverse ops directly with Eigen, which avoids copying the
+    // tensor twice compared to using individual ops.
+    Eigen::array<bool, Dims> dims;
+    // Iterate over the compile-time rank rather than dims.size() so the loop
+    // bound has the same signedness as the index.
+    for (int i = 0; i < Dims; ++i) {
+      dims[i] = reverse && (i == axis);
+    }
+    To32Bit(out).device(d) = To32Bit(in).reverse(dims)
+                                        .scan(axis, reducer, exclusive)
+                                        .reverse(dims);
+  }
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_SCAN_OPS_H_
--- a/tensorflow/core/kernels/scan_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/scan_ops_gpu.cu.cc
@ -0,0 +1,54 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/framework/register_types.h"
+
+#include "tensorflow/core/kernels/scan_ops.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+typedef Eigen::Index Index;
+
+// Explicitly instantiates the Scan functor on the GPU for one reducer,
+// element type and tensor rank.
+#define DEFINE(REDUCER, T, D) \
+  template struct functor::Scan<GPUDevice, REDUCER, T, D>;
+
+// Instantiates Scan for tensor ranks 1 through 8.
+#define DEFINE_FOR_ALL_DIMS(REDUCER, T) \
+  DEFINE(REDUCER, T, 1)                 \
+  DEFINE(REDUCER, T, 2)                 \
+  DEFINE(REDUCER, T, 3)                 \
+  DEFINE(REDUCER, T, 4)                 \
+  DEFINE(REDUCER, T, 5)                 \
+  DEFINE(REDUCER, T, 6)                 \
+  DEFINE(REDUCER, T, 7)                 \
+  DEFINE(REDUCER, T, 8)
+
+// Instantiates Scan for both the sum and the product reducer over type T.
+#define DEFINE_FOR_ALL_REDUCERS(T)                       \
+  DEFINE_FOR_ALL_DIMS(Eigen::internal::SumReducer<T>, T) \
+  DEFINE_FOR_ALL_DIMS(Eigen::internal::ProdReducer<T>, T)
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_FOR_ALL_REDUCERS);
+#undef DEFINE_FOR_ALL_REDUCERS
+#undef DEFINE_FOR_ALL_DIMS
+#undef DEFINE
+
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@ -59,8 +59,7 @@ struct ApplyAdadelta<CPUDevice, T> {
    accum.device(d) =
        accum * rho() + grad.square() * (static_cast<T>(1) - rho());
    const auto update =
-	(accum_update + epsilon()).sqrt() *
-	(accum + epsilon()).rsqrt() * grad;
+        (accum_update + epsilon()).sqrt() * (accum + epsilon()).rsqrt() * grad;
    accum_update.device(d) =
        accum_update * rho() + update.square() * (static_cast<T>(1) - rho());
    var.device(d) -= update * lr();
@ -176,9 +175,13 @@ struct ApplyMomentum<CPUDevice, T> {
                  typename TTypes<T>::Flat accum,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstFlat grad,
-                  typename TTypes<T>::ConstScalar momentum) {
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov) {
    accum.device(d) = accum * momentum() + grad;
-    var.device(d) -= accum * lr();
+    if (use_nesterov) {
+      var.device(d) -= grad * lr() + accum * momentum() * lr();
+    } else {
+      var.device(d) -= accum * lr();
+    }
  }
 };

@ -1515,6 +1518,7 @@ class ApplyMomentumOp : public OpKernel {
 public:
  explicit ApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
  }

  void Compute(OpKernelContext* ctx) override {
@ -1554,12 +1558,13 @@ class ApplyMomentumOp : public OpKernel {
    const Device& device = ctx->template eigen_device<Device>();
    functor::ApplyMomentum<Device, T>()(device, var.flat<T>(), accum.flat<T>(),
                                        lr.scalar<T>(), grad.flat<T>(),
-                                        momentum.scalar<T>());
+                                        momentum.scalar<T>(), use_nesterov_);
    ctx->forward_ref_input_to_ref_output(0, 0);
  }

 private:
  bool use_exclusive_lock_;
+  bool use_nesterov_;
 };

 typedef Eigen::ThreadPoolDevice CPUDevice;
@ -1584,7 +1589,7 @@ namespace functor {
      const GPUDevice& d, typename TTypes<T>::Flat var,                   \
      typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \
      typename TTypes<T>::ConstFlat grad,                                 \
-      typename TTypes<T>::ConstScalar momentum);                          \
+      typename TTypes<T>::ConstScalar momentum, bool use_nesterov);       \
  extern template struct ApplyMomentum<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@ -1605,6 +1610,7 @@ class SparseApplyMomentumOp : public OpKernel {
 public:
  explicit SparseApplyMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
  }

  void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
@ -1672,7 +1678,12 @@ class SparseApplyMomentumOp : public OpKernel {
        auto g = grad_flat.template chip<0>(i);
        auto v = var_flat.template chip<0>(index);
        a = a * a.constant(momentum_scalar) + g;
-        v -= a.constant(lr_scalar) * a;
+        if (use_nesterov_) {
+          v -= g.constant(lr_scalar) * g +
+               a.constant(lr_scalar) * a.constant(momentum_scalar) * a;
+        } else {
+          v -= a.constant(lr_scalar) * a;
+        }
      }
    }

@ -1681,6 +1692,7 @@ class SparseApplyMomentumOp : public OpKernel {

 private:
  bool use_exclusive_lock_;
+  bool use_nesterov_;
 };

 #define REGISTER_KERNELS(T, Tindices)                                \
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_KERNELS_TRAINING_OPS_H_
 #define TENSORFLOW_KERNELS_TRAINING_OPS_H_

-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

 namespace tensorflow {
 namespace functor {
@ -98,7 +98,7 @@ struct ApplyMomentum {
                  typename TTypes<T>::Flat accum,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstFlat grad,
-                  typename TTypes<T>::ConstScalar momentum);
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov);
 };

 template <typename Device, typename T>
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@ -17,8 +17,8 @@ limitations under the License.

 #define EIGEN_USE_GPU

-#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/training_ops.h"
+#include "tensorflow/core/framework/register_types.h"

 namespace tensorflow {

@ -84,12 +84,18 @@ struct ApplyMomentum<GPUDevice, T> {
                  typename TTypes<T>::Flat accum,
                  typename TTypes<T>::ConstScalar lr,
                  typename TTypes<T>::ConstFlat grad,
-                  typename TTypes<T>::ConstScalar momentum) {
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov) {
    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
    bcast[0] = grad.dimension(0);
    Eigen::Sizes<1> single;
    accum.device(d) = accum * momentum.reshape(single).broadcast(bcast) + grad;
-    var.device(d) -= lr.reshape(single).broadcast(bcast) * accum;
+    if (use_nesterov) {
+      var.device(d) -= grad * lr.reshape(single).broadcast(bcast) +
+                       accum * momentum.reshape(single).broadcast(bcast) *
+                           lr.reshape(single).broadcast(bcast);
+    } else {
+      var.device(d) -= lr.reshape(single).broadcast(bcast) * accum;
+    }
  }
 };

--- a/tensorflow/core/lib/gif/gif_io.cc
+++ b/tensorflow/core/lib/gif/gif_io.cc
@ -0,0 +1,95 @@
+/* Copyright 2015 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Functions to read images in GIF format.
+
+#include "tensorflow/core/lib/gif/gif_io.h"
+#include "tensorflow/core/platform/gif.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace gif {
+
+namespace {
+
+// Read cursor over the in-memory GIF bytes. A pointer to one of these is
+// handed to giflib as GifFileType::UserData so that input_callback() knows how
+// many bytes are left and never reads past the end of the caller's buffer.
+struct InputBuffer {
+  const uint8* data;  // Next unread byte.
+  int remaining;      // Number of unread bytes starting at `data`.
+};
+
+// Closes `gif_file`, releasing what giflib allocated for it. A failure to
+// close is logged but otherwise ignored.
+void CloseGif(GifFileType* gif_file) {
+  int error_code = D_GIF_SUCCEEDED;
+  if (DGifCloseFile(gif_file, &error_code) != GIF_OK) {
+    LOG(WARNING) << "Fail to close gif file, reason: "
+                 << GifErrorString(error_code);
+  }
+}
+
+}  // namespace
+
+// giflib read callback. Copies up to `size` bytes from the InputBuffer stored
+// in gif_file->UserData into `buf` and returns the number of bytes actually
+// copied. Once the input is exhausted fewer than `size` bytes (possibly 0) are
+// returned, which lets giflib detect truncated data instead of reading
+// memory that does not belong to the image.
+int input_callback(GifFileType* gif_file, GifByteType* buf, int size) {
+  InputBuffer* input = static_cast<InputBuffer*>(gif_file->UserData);
+  if (input == nullptr || input->data == nullptr || size <= 0) {
+    return 0;
+  }
+  const int num_bytes = size < input->remaining ? size : input->remaining;
+  if (num_bytes <= 0) {
+    return 0;
+  }
+  memcpy(buf, input->data, num_bytes);
+  input->data += num_bytes;
+  input->remaining -= num_bytes;
+  return num_bytes;
+}
+
+// Decodes every frame of the GIF held in the `datasize` bytes at `srcdata`
+// into packed 8-bit RGB.
+//
+// `allocate_output` is called once, as
+// allocate_output(num_frames, width, height, 3), and must return a buffer of
+// at least num_frames * height * width * 3 bytes. Frames are written back to
+// back, each row-major with interleaved R, G, B bytes.
+//
+// Returns the buffer obtained from `allocate_output`, or nullptr (after
+// logging the reason) when the data cannot be decoded. nullptr can be returned
+// after `allocate_output` has already been called, e.g. for GIFs whose frames
+// do not cover the whole canvas. The giflib handle is closed on every path.
+uint8* Decode(const void* srcdata, int datasize,
+              std::function<uint8*(int, int, int, int)> allocate_output) {
+  // A null or empty input is presented to giflib as zero readable bytes, so it
+  // fails cleanly in DGifOpen below.
+  InputBuffer input = {static_cast<const uint8*>(srcdata),
+                       (srcdata != nullptr && datasize > 0) ? datasize : 0};
+  int error_code = D_GIF_SUCCEEDED;
+  GifFileType* gif_file = DGifOpen(&input, &input_callback, &error_code);
+  if (gif_file == nullptr || error_code != D_GIF_SUCCEEDED) {
+    LOG(ERROR) << "Fail to open gif file, reason: "
+               << GifErrorString(error_code);
+    if (gif_file != nullptr) {
+      CloseGif(gif_file);
+    }
+    return nullptr;
+  }
+  if (DGifSlurp(gif_file) != GIF_OK) {
+    LOG(ERROR) << "Fail to slurp gif file, reason: "
+               << GifErrorString(gif_file->Error);
+    CloseGif(gif_file);
+    return nullptr;
+  }
+  if (gif_file->ImageCount <= 0) {
+    LOG(ERROR) << "Gif file does not contain any image";
+    CloseGif(gif_file);
+    return nullptr;
+  }
+
+  const int num_frames = gif_file->ImageCount;
+  const int width = gif_file->SWidth;
+  const int height = gif_file->SHeight;
+  const int channel = 3;
+
+  uint8* const dstdata = allocate_output(num_frames, width, height, channel);
+  if (dstdata == nullptr) {
+    LOG(ERROR) << "Fail to allocate output buffer for gif file";
+    CloseGif(gif_file);
+    return nullptr;
+  }
+
+  // Offsets are computed in size_t so that large canvases cannot overflow int.
+  const size_t row_bytes = static_cast<size_t>(width) * channel;
+  const size_t frame_bytes = row_bytes * height;
+
+  for (int k = 0; k < num_frames; k++) {
+    const SavedImage* this_image = &gif_file->SavedImages[k];
+    const GifImageDesc* img_desc = &this_image->ImageDesc;
+    // Only frames that cover the whole canvas are supported.
+    if (img_desc->Left != 0 || img_desc->Top != 0 || img_desc->Width != width ||
+        img_desc->Height != height) {
+      LOG(ERROR) << "Can't process optimized gif.";
+      CloseGif(gif_file);
+      return nullptr;
+    }
+
+    // A frame-local palette takes precedence over the global one.
+    const ColorMapObject* color_map =
+        img_desc->ColorMap ? img_desc->ColorMap : gif_file->SColorMap;
+    if (color_map == nullptr) {
+      LOG(ERROR) << "Gif file does not contain a color map";
+      CloseGif(gif_file);
+      return nullptr;
+    }
+
+    uint8* this_dst = dstdata + k * frame_bytes;
+    for (int i = 0; i < height; ++i) {
+      uint8* p_dst = this_dst + i * row_bytes;
+      for (int j = 0; j < width; ++j) {
+        const GifByteType color_index =
+            this_image->RasterBits[static_cast<size_t>(i) * width + j];
+        // The palette may hold fewer entries than a pixel value can address.
+        if (color_index >= color_map->ColorCount) {
+          LOG(ERROR) << "Gif color index is outside of the color map";
+          CloseGif(gif_file);
+          return nullptr;
+        }
+        const GifColorType& gif_color = color_map->Colors[color_index];
+        p_dst[j * channel + 0] = gif_color.Red;
+        p_dst[j * channel + 1] = gif_color.Green;
+        p_dst[j * channel + 2] = gif_color.Blue;
+      }
+    }
+  }
+
+  CloseGif(gif_file);
+  return dstdata;
+}
+
+}  // namespace gif
+}  // namespace tensorflow
--- a/tensorflow/core/lib/gif/gif_io.h
+++ b/tensorflow/core/lib/gif/gif_io.h
@ -0,0 +1,51 @@
+/* Copyright 2015 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Functions to read images in GIF format.
+//
+// Only decoding is provided; this library does not write GIF files.
+//
+// The decoding routine accepts the encoded image as a raw pointer plus a byte
+// count, so callers that hold the data in a string, a StringPiece or any other
+// contiguous buffer can pass it in without copying it first.
+//
+// The decoded pixels are 8-bit RGB and are written into a buffer obtained from
+// a caller-supplied allocation callback, which receives the number of frames
+// and the image dimensions.
+
+#ifndef TENSORFLOW_CORE_LIB_GIF_GIF_IO_H_
+#define TENSORFLOW_CORE_LIB_GIF_GIF_IO_H_
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace gif {
+
+// Decodes the GIF-encoded bytes at `srcdata` (`datasize` is presumably the
+// number of bytes available there) into 8-bit RGB pixels.
+//
+// `allocate_output` is invoked as
+// allocate_output(num_frames, width, height, 3) and returns the buffer that
+// receives the decoded frames.
+//
+// Returns the pointer obtained from `allocate_output`, or nullptr if the image
+// cannot be decoded. nullptr may be returned after `allocate_output` has
+// already been called.
+uint8* Decode(const void* srcdata, int datasize,
+              std::function<uint8*(int, int, int, int)> allocate_output);
+
+}  // namespace gif
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_LIB_GIF_GIF_IO_H_
--- a/tensorflow/core/lib/gif/testdata/optimized.gif
+++ b/tensorflow/core/lib/gif/testdata/optimized.gif
--- a/tensorflow/core/lib/gif/testdata/scan.gif
+++ b/tensorflow/core/lib/gif/testdata/scan.gif
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@ -739,7 +739,7 @@ REGISTER_OP("Reverse")
    .Input("tensor: T")
    .Input("dims: bool")
    .Output("output: T")
-    .Attr("T: {uint8, int8, int32, bool, half, float, double}")
+    .Attr("T: {uint8, int8, int32, bool, half, float, double, complex64, complex128}")
    .SetShapeFn([](InferenceContext* c) {
      const Shape* input = c->input(0);
      const Shape* dims;
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@ -440,10 +440,27 @@ compression: Compression level.
 contents: 0-D. PNG-encoded image.
 )doc");

+// --------------------------------------------------------------------------
+// Decodes a GIF into a 4-D uint8 tensor holding all of its frames. The summary
+// line of the doc string must agree with the documented output shape below.
+REGISTER_OP("DecodeGif")
+    .Input("contents: string")
+    .Output("image: uint8")
+    .Doc(R"doc(
+Decode the frame(s) of a GIF-encoded image to a uint8 tensor.
+
+GIFs with frame or transparency compression are not supported. Convert an
+animated GIF from compressed to uncompressed with:
+
+    convert $src.gif -coalesce $dst.gif
+
+contents: 0-D.  The GIF-encoded image.
+image: 4-D with shape `[num_frames, height, width, 3]`. RGB order
+)doc");
+
 // --------------------------------------------------------------------------
 REGISTER_OP("RGBToHSV")
-    .Input("images: float")
-    .Output("output: float")
+    .Input("images: T")
+    .Output("output: T")
+    .Attr("T: {float, double} = DT_FLOAT")
    .SetShapeFn(ColorspaceShapeFn)
    .Doc(R"doc(
 Converts one or more images from RGB to HSV.
@ -462,8 +479,9 @@ output: `images` converted to HSV.

 // --------------------------------------------------------------------------
 REGISTER_OP("HSVToRGB")
-    .Input("images: float")
-    .Output("output: float")
+    .Input("images: T")
+    .Output("output: T")
+    .Attr("T: {float, double} = DT_FLOAT")
    .SetShapeFn(ColorspaceShapeFn)
    .Doc(R"doc(
 Convert one or more images from HSV to RGB.
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@ -1831,4 +1831,76 @@ b: Another tensor, of same type and shape as `a`.
 product: Pairwise cross product of the vectors in `a` and `b`.
 )doc");

+// --------------------------------------------------------------------------
+
+// Registers the Cumsum op: takes a tensor `x` and an int32 `axis` input, plus
+// the boolean attrs `exclusive` and `reverse` (both default to false), and
+// produces `out` with the same element type `T` as `x`.
+REGISTER_OP("Cumsum")
+    .Input("x: T")
+    .Input("axis: int32")
+    .Attr("exclusive: bool = false")
+    .Attr("reverse: bool = false")
+    .Output("out: T")
+    .Attr("T: numbertype")
+    .Doc(R"doc(
+Compute the cumulative sum of the tensor `x` along `axis`.
+
+By default, this op performs an inclusive cumsum, which means that the first
+element of the input is identical to the first element of the output:
+```prettyprint
+tf.cumsum([a, b, c]) ==> [a, a + b, a + b + c]
+```
+
+By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+performed instead:
+```prettyprint
+tf.cumsum([a, b, c], exclusive=True) ==> [0, a, a + b]
+```
+
+By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+opposite direction:
+```prettyprint
+tf.cumsum([a, b, c], reverse=True) ==> [a + b + c, b + c, c]
+```
+This is more efficient than using separate `tf.reverse` ops.
+
+The `reverse` and `exclusive` kwargs can also be combined:
+```prettyprint
+tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0]
+```
+)doc");
+
+// Registers the Cumprod op: takes a tensor `x` and an int32 `axis` input, plus
+// the boolean attrs `exclusive` and `reverse` (both default to false), and
+// produces `out` with the same element type `T` as `x`. In the exclusive
+// examples the slot with no preceding factors holds the multiplicative
+// identity, 1, not 0.
+REGISTER_OP("Cumprod")
+    .Input("x: T")
+    .Input("axis: int32")
+    .Attr("exclusive: bool = false")
+    .Attr("reverse: bool = false")
+    .Output("out: T")
+    .Attr("T: numbertype")
+    .Doc(R"doc(
+Compute the cumulative product of the tensor `x` along `axis`.
+
+By default, this op performs an inclusive cumprod, which means that the first
+element of the input is identical to the first element of the output:
+```prettyprint
+tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c]
+```
+
+By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+performed instead:
+```prettyprint
+tf.cumprod([a, b, c], exclusive=True) ==> [1, a, a * b]
+```
+
+By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+opposite direction:
+```prettyprint
+tf.cumprod([a, b, c], reverse=True) ==> [a * b * c, b * c, c]
+```
+This is more efficient than using separate `tf.reverse` ops.
+
+The `reverse` and `exclusive` kwargs can also be combined:
+```prettyprint
+tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 1]
+```
+)doc");
+
 }  // namespace tensorflow
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@ -4342,6 +4342,42 @@ op {
  summary: "Decode a PNG-encoded image to a uint8 or uint16 tensor."
  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the PNG-encoded image.\n*   1: output a grayscale image.\n*   3: output an RGB image.\n*   4: output an RGBA image.\n\nIf needed, the PNG-encoded image is transformed to match the requested number\nof color channels."
 }
+op {
+  name: "DecodeGif"
+  input_arg {
+    name: "contents"
+    description: "0-D.  The GIF-encoded image."
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image"
+    description: "4-D with shape `[num_frames, height, width, 3]`. RGB order"
+    type: DT_UINT8
+  }
+  summary: "Decode the first frame of a GIF-encoded image to a uint8 tensor."
+  description: "GIF with frame or transparency compression are not supported\nconvert animated GIF from compressed to uncompressed by:\n\nconvert $src.gif -coalesce $dst.gif"
+}
 op {
  name: "DecodeRaw"
  input_arg {
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@ -488,11 +488,13 @@ REGISTER_OP("ApplyMomentum")
    .Output("out: Ref(T)")
    .Attr("T: numbertype")
    .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
    .SetShapeFn([](InferenceContext* c) {
      return ApplyMomentumShapeFn(c, false /* sparse */);
    })
    .Doc(R"doc(
-Update '*var' according to the momentum scheme.
+Update '*var' according to the momentum scheme. Set use_nesterov = True if you
+want to use Nesterov momentum.

 accum = accum * momentum + grad
 var -= lr * accum
@ -506,6 +508,9 @@ out: Same as "var".
 use_locking: If `True`, updating of the var and accum tensors will be protected
  by a lock; otherwise the behavior is undefined, but may exhibit less
  contention.
+use_nesterov: If `True`, the tensor passed to compute grad will be 
+var - lr * momentum * accum, so in the end, the var you get is actually
+var - lr * momentum * accum.
 )doc");

 REGISTER_OP("SparseApplyMomentum")
@ -519,11 +524,13 @@ REGISTER_OP("SparseApplyMomentum")
    .Attr("T: numbertype")
    .Attr("Tindices: {int32, int64}")
    .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
    .SetShapeFn([](InferenceContext* c) {
      return ApplyMomentumShapeFn(c, true /* sparse */);
    })
    .Doc(R"doc(
 Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+Set use_nesterov = True if you want to use Nesterov momentum.

 That is for rows we have grad for, we update var and accum as follows:

@ -540,6 +547,9 @@ out: Same as "var".
 use_locking: If `True`, updating of the var and accum tensors will be protected
  by a lock; otherwise the behavior is undefined, but may exhibit less
  contention.
+use_nesterov: If `True`, the tensor passed to compute grad will be 
+var - lr * momentum * accum, so in the end, the var you get is actually
+var - lr * momentum * accum.
 )doc");

 static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) {
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@ -57,12 +57,13 @@ cc_library(
    name = "platformlib",
    copts = tf_copts(),
    deps = [
+        "//tensorflow/core:protos_cc",
        "@farmhash_archive//:farmhash",
+        "@gif_archive//:gif",
+        "@highwayhash//:sip_hash",
        "@jpeg_archive//:jpeg",
        "@png_archive//:png",
-        "@highwayhash//:sip_hash",
        "@re2//:re2",
-        "//tensorflow/core:protos_cc",
    ],
 )

--- a/tensorflow/core/platform/gif.h
+++ b/tensorflow/core/platform/gif.h
@ -0,0 +1,29 @@
+/* Copyright 2015 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_GIF_H_
+#define TENSORFLOW_CORE_PLATFORM_GIF_H_
+
+#include "tensorflow/core/platform/platform.h"
+
+#if defined(PLATFORM_GOOGLE)
+#include "tensorflow/core/platform/google/build_config/gif.h"
+#elif defined(PLATFORM_POSIX) && !defined(IS_MOBILE_PLATFORM)
+#include "giflib-5.1.4/lib/gif_lib.h"
+#else
+#error Define the appropriate PLATFORM_<foo> macro for this platform
+#endif
+
+#endif  // TENSORFLOW_CORE_PLATFORM_GIF_H_
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py
@ -94,8 +94,8 @@ def run_training():
    saver = tf.train.Saver()

    # Create the op for initializing variables.
-    init_op = tf.initialize_all_variables()
-
+    init_op = tf.group(tf.initialize_all_variables(),
+                       tf.initialize_local_variables())
    # Create a session for running Ops on the Graph.
    sess = tf.Session()

--- a/tensorflow/examples/label_image/main.cc
+++ b/tensorflow/examples/label_image/main.cc
@ -99,8 +99,10 @@ Status ReadTensorFromImageFile(string file_name, const int input_height,
  if (tensorflow::StringPiece(file_name).ends_with(".png")) {
    image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                             DecodePng::Channels(wanted_channels));
+  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
+    image_reader = DecodeGif(root.WithOpName("gif_reader"), file_reader);
  } else {
-    // Assume if it's not a PNG then it must be a JPEG.
+    // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
    image_reader = DecodeJpeg(root.WithOpName("jpeg_reader"), file_reader,
                              DecodeJpeg::Channels(wanted_channels));
  }
--- a/tensorflow/examples/skflow/resnet.py
+++ b/tensorflow/examples/skflow/resnet.py
@ -52,13 +52,13 @@ def res_net(x, y, activation=tf.nn.relu):
    Predictions and loss tensors.
  """

-  # Configurations for each bottleneck block.
-  BottleneckBlock = namedtuple(
-      'BottleneckBlock', ['num_layers', 'num_filters', 'bottleneck_size'])
-  blocks = [BottleneckBlock(3, 128, 32),
-            BottleneckBlock(3, 256, 64),
-            BottleneckBlock(3, 512, 128),
-            BottleneckBlock(3, 1024, 256)]
+  # Configurations for each bottleneck group.
+  BottleneckGroup = namedtuple(
+      'BottleneckGroup', ['num_blocks', 'num_filters', 'bottleneck_size'])
+  groups = [BottleneckGroup(3, 128, 32),
+            BottleneckGroup(3, 256, 64),
+            BottleneckGroup(3, 512, 128),
+            BottleneckGroup(3, 1024, 256)]

  input_shape = x.get_shape().as_list()

@ -78,19 +78,19 @@ def res_net(x, y, activation=tf.nn.relu):

  # First chain of resnets
  with tf.variable_scope('conv_layer2'):
-    net = learn.ops.conv2d(net, blocks[0].num_filters,
+    net = learn.ops.conv2d(net, groups[0].num_filters,
                           [1, 1], [1, 1, 1, 1],
                           padding='VALID', bias=True)

-  # Create each bottleneck building block for each layer
-  for block_i, block in enumerate(blocks):
-    for layer_i in range(block.num_layers):
-
-      name = 'block_%d/layer_%d' % (block_i, layer_i)
+  # Create the bottleneck groups, each of which contains `num_blocks`
+  # bottleneck blocks.
+  for group_i, group in enumerate(groups):
+    for block_i in range(group.num_blocks):
+      name = 'group_%d/block_%d' % (group_i, block_i)

      # 1x1 convolution responsible for reducing dimension
      with tf.variable_scope(name + '/conv_in'):
-        conv = learn.ops.conv2d(net, block.bottleneck_size,
+        conv = learn.ops.conv2d(net, group.bottleneck_size,
                                [1, 1], [1, 1, 1, 1],
                                padding='VALID',
                                activation=activation,
@ -98,7 +98,7 @@ def res_net(x, y, activation=tf.nn.relu):
                                bias=False)

      with tf.variable_scope(name + '/conv_bottleneck'):
-        conv = learn.ops.conv2d(conv, block.bottleneck_size,
+        conv = learn.ops.conv2d(conv, group.bottleneck_size,
                                [3, 3], [1, 1, 1, 1],
                                padding='SAME',
                                activation=activation,
@ -107,7 +107,8 @@ def res_net(x, y, activation=tf.nn.relu):

      # 1x1 convolution responsible for restoring dimension
      with tf.variable_scope(name + '/conv_out'):
-        conv = learn.ops.conv2d(conv, block.num_filters,
+        input_dim = net.get_shape()[-1].value
+        conv = learn.ops.conv2d(conv, input_dim,
                                [1, 1], [1, 1, 1, 1],
                                padding='VALID',
                                activation=activation,
@ -118,16 +119,16 @@ def res_net(x, y, activation=tf.nn.relu):
      # residual function (identity shortcut)
      net = conv + net

-      try:
-        # upscale to the next block size
-        next_block = blocks[block_i + 1]
-        with tf.variable_scope('block_%d/conv_upscale' % block_i):
-          net = learn.ops.conv2d(net, next_block.num_filters,
-                                 [1, 1], [1, 1, 1, 1],
-                                 bias=False,
-                                 padding='SAME')
-      except IndexError:
-        pass
+    try:
+      # upscale to the next group size
+      next_group = groups[group_i + 1]
+      with tf.variable_scope('block_%d/conv_upscale' % group_i):
+        net = learn.ops.conv2d(net, next_group.num_filters,
+                               [1, 1], [1, 1, 1, 1],
+                               bias=False,
+                               padding='SAME')
+    except IndexError:
+      pass

  net_shape = net.get_shape().as_list()
  net = tf.nn.avg_pool(net,
@ -139,18 +140,12 @@ def res_net(x, y, activation=tf.nn.relu):

  return learn.models.logistic_regression(net, y)

-
 # Download and load MNIST data.
 mnist = input_data.read_data_sets('MNIST_data')

 # Restore model if graph is saved into a folder.
 if os.path.exists('models/resnet/graph.pbtxt'):
  classifier = learn.TensorFlowEstimator.restore('models/resnet/')
-else:
-  # Create a new resnet classifier.
-  classifier = learn.TensorFlowEstimator(
-      model_fn=res_net, n_classes=10, batch_size=100, steps=100,
-      learning_rate=0.001, continue_training=True)

 while True:
  # Train model and save summaries into logdir.
@ -161,6 +156,3 @@ while True:
  score = metrics.accuracy_score(
      mnist.test.labels, classifier.predict(mnist.test.images, batch_size=64))
  print('Accuracy: {0:f}'.format(score))
-
-  # Save model graph and checkpoints.
-  classifier.save('models/resnet/')
--- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
@ -49,7 +49,7 @@ def train():

  # Create a multilayer model.

-  # Input placehoolders
+  # Input placeholders
  with tf.name_scope('input'):
    x = tf.placeholder(tf.float32, [None, 784], name='x-input')
    y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')
--- a/tensorflow/examples/udacity/README.md
+++ b/tensorflow/examples/udacity/README.md
@ -6,7 +6,11 @@ Course information can be found at https://www.udacity.com/course/deep-learning-
 Running the Docker container from the Google Cloud repository
 -------------------------------------------------------------

-    docker run -p 8888:8888 -it b.gcr.io/tensorflow-udacity/assignments:0.5.0
+    docker run -p 8888:8888 --name tensorflow-udacity -it b.gcr.io/tensorflow-udacity/assignments:0.5.0
+
+Note that if you ever exit the container, you can return to it using:
+
+    docker start -ai tensorflow-udacity

 Accessing the Notebooks
 -----------------------
@ -19,21 +23,6 @@ On mac, find the virtual machine's IP using:

 Then go to: http://IP:8888 (likely http://192.168.99.100:8888)

-Saving Your Progress
--------------------
-
-Because of the `--rm` flag above, stopping the docker container removes it, so any changes you've made will disappear. One way around this is to remove the `--rm` flag, and name the container for easy restarting:
-```sh
-# you only need to "run" the container the first time:
-docker run -p 8888:8888 -it --name tensorflow-udacity b.gcr.io/tensorflow-udacity/assignments:0.5.0
-# …do various things…
-# when you're done, control-C to kill jupyter and stop the container
-# when you're ready to do more things, you can now just "start" the container:
-docker start -ai tensorflow-udacity
-# …do more things…
-# …repeat…
-```
-
 FAQ
 ---

--- a/tensorflow/g3doc/get_started/os_setup.md
+++ b/tensorflow/g3doc/get_started/os_setup.md
@ -44,7 +44,7 @@ management system used to install and manage software packages written in
 Python.

 The packages that will be installed or upgraded during the pip install are listed in the
-[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py)
+[REQUIRED_PACKAGES section of setup.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/pip_package/setup.py).

 Install pip (or pip3 for python3) if it is not already installed:

@ -231,7 +231,7 @@ packages needed by TensorFlow.

 Install Anaconda:

-Follow the instructions on the [Anaconda download site](https://www.continuum.io/downloads)
+Follow the instructions on the [Anaconda download site](https://www.continuum.io/downloads).

 Create a conda environment called `tensorflow`:

@ -377,6 +377,8 @ The option `-p 8888:8888` is used to publish the Docker container's internal p

 The format of the port mapping is `hostPort:containerPort`. You can specify any valid port number for the host port but have to use `8888` for the container port portion.

+If you're using a container with GPU support, some additional flags must be passed to expose the GPU device to the container.
+
 For NVidia GPU support install latest NVidia drivers and
 [nvidia-docker](https://github.com/NVIDIA/nvidia-docker).
 Run with
@ -385,7 +387,15 @@ Run with
 $ nvidia-docker run -it -p 8888:8888 gcr.io/tensorflow/tensorflow:latest-gpu
 ```

-For more details see (TensorFlow docker readme)[https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/docker].
+If you have a problem running `nvidia-docker`, you can use the default config instead: we include a
+[script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/docker_run_gpu.sh)
+in the repo with these flags, so the command line would look like:
+
+```bash
+$ path/to/repo/tensorflow/tools/docker/docker_run_gpu.sh -p 8888:8888 gcr.io/tensorflow/tensorflow:latest-gpu
+```
+
+For more details see [TensorFlow docker readme](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/docker).

 You can now [test your installation](#test-the-tensorflow-installation) within the Docker container.

@ -479,7 +489,7 @@ of tensorflow. If you want to install a specific branch (such as a release branc
 pass `-b <branchname>` to the `git clone` command and `--recurse-submodules` for
 r0.8 and earlier to fetch the protobuf library that TensorFlow depends on.

-### Installation for Linux
+### Prepare environment for Linux

 #### Install Bazel

@ -508,19 +518,6 @@ $ sudo apt-get install python-numpy swig python-dev python-wheel
 $ sudo apt-get install python3-numpy swig python3-dev python3-wheel
 ```

-#### Configure the installation
-
-Run the `configure` script at the root of the tree.  The configure script
-asks you for the path to your python interpreter and allows (optional)
-configuration of the CUDA libraries (see [below](#configure-tensorflows-canonical-view-of-cuda-libraries)).
-
-This step is used to locate the python and numpy header files.
-
-```bash
-$ ./configure
-Please specify the location of python. [Default is /usr/bin/python]:
-```
-
 #### Optional: Install CUDA (GPUs on Linux)

 In order to build or run TensorFlow with GPU support, both NVIDIA's Cuda Toolkit (>= 7.0) and
@ -564,83 +561,7 @@ sudo cp cuda/lib64/libcudnn* /usr/local/cuda/lib64
 sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
 ```

-##### Configure TensorFlow's canonical view of Cuda libraries
-
-When running the `configure` script from the root of your source tree, select
-the option `Y` when asked to build TensorFlow with GPU support. If you have
-several versions of Cuda or cuDNN installed, you should definitely select
-one explicitly instead of relying on the system default. You should see
-prompts like the following:
-
-``` bash
-$ ./configure
-Please specify the location of python. [Default is /usr/bin/python]:
-Do you wish to build TensorFlow with GPU support? [y/N] y
-GPU support will be enabled for TensorFlow
-
-Please specify which gcc nvcc should use as the host compiler. [Default is
-/usr/bin/gcc]: /usr/bin/gcc-4.9
-
-Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave
-empty to use system default]: 7.5
-
-Please specify the location where CUDA 7.5 toolkit is installed. Refer to
-README.md for more details. [default is: /usr/local/cuda]: /usr/local/cuda
-
-Please specify the cuDNN version you want to use. [Leave empty to use system
-default]: 4.0.4
-
-Please specify the location where the cuDNN 4.0.4 library is installed. Refer to
-README.md for more details. [default is: /usr/local/cuda]: /usr/local/cudnn-r4-rc/
-
-Please specify a list of comma-separated Cuda compute capabilities you want to
-build with. You can find the compute capability of your device at:
-https://developer.nvidia.com/cuda-gpus.
-Please note that each additional compute capability significantly increases your
-build time and binary size. [Default is: \"3.5,5.2\"]: 3.5
-
-Setting up Cuda include
-Setting up Cuda lib64
-Setting up Cuda bin
-Setting up Cuda nvvm
-Setting up CUPTI include
-Setting up CUPTI lib64
-Configuration finished
-```
-
-This creates a canonical set of symbolic links to the Cuda libraries on your system.
-Every time you change the Cuda library paths you need to run this step again before
-you invoke the bazel build command. For the cuDNN libraries, use '6.5' for R2, '7.0'
-for R3, and '4.0.4' for R4-RC.
-
-
-##### Build your target with GPU support
-From the root of your source tree, run:
-
-```bash
-$ bazel build -c opt --config=cuda //tensorflow/cc:tutorials_example_trainer
-
-$ bazel-bin/tensorflow/cc/tutorials_example_trainer --use_gpu
-# Lots of output. This tutorial iteratively calculates the major eigenvalue of
-# a 2x2 matrix, on GPU. The last few lines look like this.
-000009/000005 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
-000006/000001 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
-000009/000009 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
-```
-
-Note that "--config=cuda" is needed to enable the GPU support.
-
-##### Known issues
-
-* Although it is possible to build both Cuda and non-Cuda configs under the same
-source tree, we recommend to run `bazel clean` when switching between these two
-configs in the same source tree.
-
-* You have to run configure before running bazel build. Otherwise, the build
-will fail with a clear error message. In the future, we might consider making
-this more convenient by including the configure step in our build process.
-
-### Installation for Mac OS X
+### Prepare environment for Mac OS X

 We recommend using [homebrew](http://brew.sh) to install the bazel and SWIG
 dependencies, and installing python dependencies using easy_install or pip.
@ -713,15 +634,20 @@ $ sudo mv lib/libcudnn* /Developer/NVIDIA/CUDA-7.5/lib
 $ sudo ln -s /Developer/NVIDIA/CUDA-7.5/lib/libcudnn* /usr/local/cuda/lib/
 ```

-#### Configure the installation
+### Configure the installation

 Run the `configure` script at the root of the tree.  The configure script
-asks you for the path to your python interpreter.
+asks you for the path to your python interpreter and allows (optional)
+configuration of the CUDA libraries.

 This step is used to locate the python and numpy header files as well as
-enabling GPU support if you have a CUDA enabled GPU and Toolkit installed. For
-example:
+enabling GPU support if you have a CUDA enabled GPU and Toolkit installed.
+Select the option `Y` when asked to build TensorFlow with GPU support.

+If you have several versions of Cuda or cuDNN installed, you should definitely
+select one explicitly instead of relying on the system default.
+
+For example:

 ```bash
 $ ./configure
@ -748,6 +674,38 @@ Setting up CUPTI lib64
 Configuration finished
 ```

+This creates a canonical set of symbolic links to the Cuda libraries on your system.
+Every time you change the Cuda library paths you need to run this step again before
+you invoke the bazel build command. For the cuDNN libraries, use '6.5' for R2, '7.0'
+for R3, and '4.0.4' for R4-RC.
+
+#### Build your target with GPU support
+From the root of your source tree, run:
+
+```bash
+$ bazel build -c opt --config=cuda //tensorflow/cc:tutorials_example_trainer
+
+$ bazel-bin/tensorflow/cc/tutorials_example_trainer --use_gpu
+# Lots of output. This tutorial iteratively calculates the major eigenvalue of
+# a 2x2 matrix, on GPU. The last few lines look like this.
+000009/000005 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
+000006/000001 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
+000009/000009 lambda = 2.000000 x = [0.894427 -0.447214] y = [1.788854 -0.894427]
+```
+
+Note that "--config=cuda" is needed to enable the GPU support.
+
+#### Known issues
+
+* Although it is possible to build both Cuda and non-Cuda configs under the same
+source tree, we recommend running `bazel clean` when switching between these two
+configs in the same source tree.
+
+* You have to run configure before running bazel build. Otherwise, the build
+will fail with a clear error message. In the future, we might consider making
+this more convenient by including the configure step in our build process.
+
+
 ### Create the pip package and install

 When building from source, you will still build a pip package and install that.
--- a/tensorflow/g3doc/how_tos/image_retraining/index.md
+++ b/tensorflow/g3doc/how_tos/image_retraining/index.md
@ -131,7 +131,7 @@ Once TensorBoard is running, navigate your web browser to `localhost:6006` to vi

 The script will log TensorBoard summaries to `/tmp/retrain_logs` by default. You can change the directory with the `--summaries_dir` flag.

-The [TensorBoard README](../../../tensorboard/README.md) has a lot more information on TensorBoard usage, including tips & tricks, and debugging information.
+The [TensorBoard README](https://www.tensorflow.org/code/tensorflow/tensorboard/README.md) has a lot more information on TensorBoard usage, including tips & tricks, and debugging information.

 ## Using the Retrained Model

--- a/tensorflow/g3doc/resources/index.md
+++ b/tensorflow/g3doc/resources/index.md
@ -37,6 +37,7 @@ The TensorFlow community has created many great projects around TensorFlow, incl
 * [TensorFlow tutorials](https://github.com/pkmital/tensorflow_tutorials)
 * [Scikit Flow - Simplified Interface for TensorFlow](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/learn/python/learn)
 * [Caffe to TensorFlow model converter](https://github.com/ethereon/caffe-tensorflow)
+* [Bitfusion's GPU-enabled AWS EC2 TensorFlow AMI](https://github.com/bitfusionio/amis/tree/master/awsmrkt-bfboost-ubuntu14-cuda75-tensorflow) ([Launch AMI](https://aws.amazon.com/marketplace/pp/B01EYKBEQ0))

 ### Development

--- a/tensorflow/g3doc/tutorials/mnist/pros/index.md
+++ b/tensorflow/g3doc/tutorials/mnist/pros/index.md
@ -190,11 +190,11 @@ accomplished by repeatedly running `train_step`.

 ```python
 for i in range(1000):
-  batch = mnist.train.next_batch(50)
+  batch = mnist.train.next_batch(100)
  train_step.run(feed_dict={x: batch[0], y_: batch[1]})
 ```

-Each training iteration we load 50 training examples. We then run the
+Each training iteration we load 100 training examples. We then run the
 `train_step` operation, using `feed_dict` to replace the `placeholder` tensors
 `x` and `y_` with the training examples.
 Note that you can replace any tensor in your computation graph using `feed_dict`
--- a/tensorflow/g3doc/tutorials/recurrent/index.md
+++ b/tensorflow/g3doc/tutorials/recurrent/index.md
@ -178,6 +178,7 @@ https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/get_starte
 [bazel](https://github.com/bazelbuild/bazel)).

 Next:
+
 ```bash
 cd tensorflow/models/rnn/ptb
 python ptb_word_lm.py --data_path=/tmp/simple-examples/data/ --model small
--- a/tensorflow/g3doc/tutorials/tflearn/index.md
+++ b/tensorflow/g3doc/tutorials/tflearn/index.md
@ -240,10 +240,11 @@ second sample is *Iris virginica*.
 * For further reference materials on tf.contrib.learn, see the official
 [API docs](../../api_docs/python/contrib.learn.md).

-<!-- David, will the below be live when this tutorial is released? -->
 * To learn more about using tf.contrib.learn to create linear models, see 
 [Large-scale Linear Models with TensorFlow](../linear/).

+* To build your own Estimator using tf.contrib.learn APIs, check out [Building Machine Learning Estimator in TensorFlow](http://terrytangyuan.github.io/2016/07/08/understand-and-build-tensorflow-estimator/).
+
 * To experiment with neural network modeling and visualization in the browser,
 check out [Deep Playground](http://playground.tensorflow.org/).

--- a/tensorflow/models/embedding/word2vec.py
+++ b/tensorflow/models/embedding/word2vec.py
@ -378,7 +378,8 @@ class Word2Vec(object):
    opts = self._options
    with open(os.path.join(opts.save_path, "vocab.txt"), "w") as f:
      for i in xrange(opts.vocab_size):
-        f.write("%s %d\n" % (tf.compat.as_text(opts.vocab_words[i]),
+        vocab_word = tf.compat.as_text(opts.vocab_words[i]).encode("utf-8")
+        f.write("%s %d\n" % (vocab_word,
                             opts.vocab_counts[i]))

  def _train_thread_body(self):
--- a/tensorflow/models/image/mnist/convolutional.py
+++ b/tensorflow/models/image/mnist/convolutional.py
@ -82,10 +82,10 @@ def extract_data(filename, num_images):
  print('Extracting', filename)
  with gzip.open(filename) as bytestream:
    bytestream.read(16)
-    buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images)
+    buf = bytestream.read(IMAGE_SIZE * IMAGE_SIZE * num_images * NUM_CHANNELS)
    data = numpy.frombuffer(buf, dtype=numpy.uint8).astype(numpy.float32)
    data = (data - (PIXEL_DEPTH / 2.0)) / PIXEL_DEPTH
-    data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, 1)
+    data = data.reshape(num_images, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)
    return data


--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@ -146,6 +146,7 @@ cuda_py_tests(
        "reverse_sequence_op_test.py",
        "rnn_cell_test.py",
        "scalar_strict_test.py",
+        "scan_ops_test.py",
        "session_ops_test.py",
        "shape_ops_test.py",
        "softmax_op_test.py",
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@ -198,14 +198,19 @@ class ReverseTest(test_util.TensorFlowTestCase):
        x_tf = array_ops.reverse(x_np, []).eval()
        self.assertAllEqual(x_tf, x_np)

-  def testReverse1DimAuto(self):
-    x_np = [1, 4, 9]
+  def _reverse1DimAuto(self, np_dtype):
+    x_np = np.array([1, 2, 3, 4, 5], dtype=np_dtype)

    for use_gpu in [False, True]:
      with self.test_session(use_gpu=use_gpu):
        x_tf = array_ops.reverse(x_np, [True]).eval()
        self.assertAllEqual(x_tf, np.asarray(x_np)[::-1])

+  def testReverse1DimAuto(self):
+    """Runs the 1-D reverse check once for every dtype listed below."""
+    # np.bool_ is numpy's boolean scalar type and yields the same array dtype
+    # as the bare `np.bool` alias, which is deprecated and absent from some
+    # NumPy releases.
+    for dtype in [np.uint8, np.int8, np.int32, np.bool_, np.float16,
+                  np.float32, np.float64, np.complex64, np.complex128]:
+      self._reverse1DimAuto(dtype)
+
  def testUnknownDims(self):
    data_t = tf.placeholder(tf.float32)
    dims_known_t = tf.placeholder(tf.bool, shape=[3])
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@ -432,16 +432,13 @@ class ProdReductionTest(tf.test.TestCase):
    self._compareAll(np_arr, [0, 2])
    self._compareAll(np_arr, [0, 1, 2])

-  def testGradient(self):
-    s = [2, 3, 4, 2]
-    # NOTE(kearnes): divide by 20 so product is a reasonable size
-    x = np.arange(1.0, 49.0).reshape(s).astype(np.float32) / 20.
+  def _compareGradient(self, x):
    with self.test_session():
      t = tf.convert_to_tensor(x)

      su = tf.reduce_prod(t, [])
      jacob_t, jacob_n = tf.test.compute_gradient(t,
-                                                  s,
+                                                  x.shape,
                                                  su,
                                                  [2, 3, 4, 2],
                                                  x_init_value=x,
@ -450,7 +447,7 @@ class ProdReductionTest(tf.test.TestCase):

      su = tf.reduce_prod(t, [1, 2])
      jacob_t, jacob_n = tf.test.compute_gradient(t,
-                                                  s,
+                                                  x.shape,
                                                  su,
                                                  [2, 2],
                                                  x_init_value=x,
@ -459,26 +456,34 @@ class ProdReductionTest(tf.test.TestCase):

      su = tf.reduce_prod(t, [0, 1, 2, 3])
      jacob_t, jacob_n = tf.test.compute_gradient(t,
-                                                  s,
+                                                  x.shape,
                                                  su,
                                                  [1],
                                                  x_init_value=x,
                                                  delta=1)
      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)

-    # NOTE(kearnes): the current gradient calculation gives NaNs for 0 inputs
-    x = np.arange(0.0, 48.0).reshape(s).astype(np.float32) / 20.
-    with self.test_session():
-      t = tf.convert_to_tensor(x)
-      su = tf.reduce_prod(t, [])
-      jacob_t, _ = tf.test.compute_gradient(t,
-                                            s,
-                                            su,
-                                            [2, 3, 4, 2],
-                                            x_init_value=x,
-                                            delta=1)
-      with self.assertRaisesOpError("Tensor had NaN values"):
-        tf.check_numerics(jacob_t, message="_ProdGrad NaN test").op.run()
+  def testGradientWithZeros(self):
+    """Checks the product gradient on inputs with zeros in various places."""
+    base = np.arange(1.0, 49.0).reshape([2, 3, 4, 2]).astype(np.float32) / 20.
+    # No zeros in input
+    self._compareGradient(base)
+    # Zero out, in turn, the first, last and a middle slice of dimension 2,
+    # and finally every entry of the input.
+    for zeroed in (0, -1, 2, slice(None)):
+      with_zeros = base.copy()
+      with_zeros[:, :, zeroed, :] = 0
+      self._compareGradient(with_zeros)

  def testEmptyGradients(self):
    with self.test_session():
--- a/tensorflow/python/kernel_tests/scan_ops_test.py
+++ b/tensorflow/python/kernel_tests/scan_ops_test.py
@ -0,0 +1,229 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functional tests for scan ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from itertools import combinations
+
+import numpy as np
+import tensorflow as tf
+
+
+def numpy_reverse(x, axis):
+  """Returns `x` with the order of its entries flipped along `axis`.
+
+  Args:
+    x: numpy array to reverse.
+    axis: integer index of the dimension to flip.
+
+  Returns:
+    `x` indexed with a reversed slice along `axis` and a full slice along
+    every other dimension.
+  """
+  ix = [slice(None, None, -1)
+        if i == axis else slice(None) for i in range(len(x.shape))]
+  # Index with a tuple: recent NumPy releases no longer interpret a list of
+  # slices as "one slice per dimension".
+  return x[tuple(ix)]
+
+def handle_options(func, x, axis, exclusive, reverse):
+  """Adds tf options to numpy scan ops.
+
+  Args:
+    func: the numpy scan to apply; `np.cumsum` or `np.cumprod`.
+    x: numpy array to scan.
+    axis: integer index of the dimension to scan along.
+    exclusive: if true, the output along `axis` starts with the scan's
+      identity (0 for cumsum, 1 for cumprod) followed by the scan of all but
+      the last input entry.
+    reverse: if true, `x` is flipped along `axis` before the scan and the
+      result is flipped back afterwards.
+
+  Returns:
+    The numpy array holding the scan result.
+
+  Raises:
+    ValueError: if `exclusive` is set and `func` is neither `np.cumsum` nor
+      `np.cumprod`.
+  """
+  if reverse:
+    x = numpy_reverse(x, axis)
+
+  if exclusive:
+    # Build the indices as tuples: recent NumPy releases no longer interpret
+    # a list of slices as "one slice per dimension".
+    ix_head = tuple(slice(0, 1) if i == axis else slice(None)
+                    for i in range(len(x.shape)))
+    ix_init = tuple(slice(0, -1) if i == axis else slice(None)
+                    for i in range(len(x.shape)))
+    if func == np.cumsum:
+      init = np.zeros_like(x[ix_head])
+    elif func == np.cumprod:
+      init = np.ones_like(x[ix_head])
+    else:
+      raise ValueError("Unknown scan function")
+    x = np.concatenate([init, func(x[ix_init], axis)], axis=axis)
+  else:
+    x = func(x, axis=axis)
+
+  if reverse:
+    x = numpy_reverse(x, axis)
+  return x
+
+class CumsumTest(tf.test.TestCase):
+  """Tests `tf.cumsum` against a numpy reference and checks its gradient."""
+
+  # dtypes the forward comparison tests iterate over.
+  valid_dtypes = [np.int32, np.int64, np.float16, np.float32,
+                  np.float64, np.complex64, np.complex128]
+
+  def _compare(self, x, axis, exclusive, reverse, use_gpu=False):
+    """Asserts `tf.cumsum` agrees with the numpy reference for one setting."""
+    np_out = handle_options(np.cumsum, x, axis, exclusive, reverse)
+    with self.test_session(use_gpu=use_gpu):
+      tf_out = tf.cumsum(x, axis, exclusive, reverse).eval()
+
+    self.assertAllClose(np_out, tf_out)
+
+  def _compareAll(self, x, axis):
+    """Runs `_compare` for every exclusive/reverse/use_gpu combination."""
+    for exclusive in [True, False]:
+      for reverse in [True, False]:
+        for use_gpu in [True, False]:
+          self._compare(x, axis, exclusive, reverse, use_gpu)
+
+  def test1D(self):
+    """Scans a 5-element vector for every dtype in `valid_dtypes`."""
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      self._compareAll(x, 0)
+
+  def test2D(self):
+    """Scans a 2x5 matrix along each of its axes for every dtype."""
+    for dtype in self.valid_dtypes:
+      x = np.arange(0, 10).reshape([2, 5]).astype(dtype)
+      self._compareAll(x, 0)
+      self._compareAll(x, 1)
+
+  def test3D(self):
+    """Scans a 2x2x5 array along each of its axes for every dtype."""
+    for dtype in self.valid_dtypes:
+      x = np.arange(0, 20).reshape([2, 2, 5]).astype(dtype)
+      self._compareAll(x, 0)
+      self._compareAll(x, 1)
+      self._compareAll(x, 2)
+
+  def testInvalidAxis(self):
+    """Expects InvalidArgumentError for out-of-range and non-scalar axes."""
+    x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
+    input_tensor = tf.convert_to_tensor(x)
+    with self.test_session():
+      # The input is 2-D; axis -1 and axis 2 must both be rejected.
+      with self.assertRaisesWithPredicateMatch(
+          tf.errors.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range" in str(e)):
+        tf.cumsum(input_tensor, -1).eval()
+      with self.assertRaisesWithPredicateMatch(
+          tf.errors.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range" in str(e)):
+        tf.cumsum(input_tensor, 2).eval()
+      # A list-valued axis must be rejected as well.
+      with self.assertRaisesWithPredicateMatch(
+          tf.errors.InvalidArgumentError,
+          lambda e: "axis must be a scalar" in str(e)):
+        tf.cumsum(input_tensor, [0]).eval()
+
+  def _compareGradient(self, shape, axis, exclusive, reverse):
+    """Compares the two Jacobians returned by `tf.test.compute_gradient`.
+
+    Args:
+      shape: list of ints the 50-element input is reshaped to, so its product
+        must be 50. Also passed as the shape of the `tf.cumsum` output.
+      axis: axis handed to `tf.cumsum`.
+      exclusive: `exclusive` option handed to `tf.cumsum`.
+      reverse: `reverse` option handed to `tf.cumsum`.
+    """
+    x = np.arange(0, 50).reshape(shape).astype(np.float64)
+    with self.test_session():
+      t = tf.convert_to_tensor(x)
+      result = tf.cumsum(t, axis, exclusive, reverse)
+      jacob_t, jacob_n = tf.test.compute_gradient(t,
+                                                  shape,
+                                                  result,
+                                                  shape,
+                                                  x_init_value=x,
+                                                  delta=1)
+    self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
+
+  def testGradient(self):
+    self._compareGradient([50], 0, False, False)
+
+  def testGradientReverse(self):
+    self._compareGradient([50], 0, False, True)
+
+  def testGradientExclusive(self):
+    self._compareGradient([50], 0, True, False)
+
+  def testGradientExclusiveReverse(self):
+    self._compareGradient([50], 0, True, True)
+
+  def testGradient2D(self):
+    """Checks the gradient on a 5x10 input for every axis/option combination."""
+    for axis in [0, 1]:
+      for exclusive in [True, False]:
+        for reverse in [True, False]:
+          self._compareGradient([5, 10], axis, exclusive, reverse)
+
+
+class CumprodTest(tf.test.TestCase):
+  """Tests `tf.cumprod` against a numpy reference and checks its gradient."""
+
+  # dtypes the forward comparison tests iterate over.
+  valid_dtypes = [np.int32, np.int64, np.float16, np.float32,
+                  np.float64, np.complex64, np.complex128]
+
+  def _compare(self, x, axis, exclusive, reverse, use_gpu=False):
+    """Asserts `tf.cumprod` agrees with the numpy reference for one setting."""
+    np_out = handle_options(np.cumprod, x, axis, exclusive, reverse)
+    with self.test_session(use_gpu=use_gpu):
+      tf_out = tf.cumprod(x, axis, exclusive, reverse).eval()
+
+    self.assertAllClose(np_out, tf_out)
+
+  def _compareAll(self, x, axis):
+    """Runs `_compare` for every exclusive/reverse/use_gpu combination."""
+    for exclusive in [True, False]:
+      for reverse in [True, False]:
+        for use_gpu in [True, False]:
+          self._compare(x, axis, exclusive, reverse, use_gpu)
+
+
+  def test1D(self):
+    """Scans a 5-element vector for every dtype in `valid_dtypes`."""
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      self._compareAll(x, 0)
+
+  def test2D(self):
+    """Scans a 2x5 matrix along each of its axes for every dtype."""
+    for dtype in self.valid_dtypes:
+      # Values start at 1, so the input contains no zero entry.
+      x = np.arange(1, 11).reshape([2, 5]).astype(dtype)
+      self._compareAll(x, 0)
+      self._compareAll(x, 1)
+
+  def test3D(self):
+    """Scans a 2x2x5 array along each of its axes for every dtype."""
+    for dtype in self.valid_dtypes:
+      # Values start at 1, so the input contains no zero entry.
+      x = np.arange(1, 21).reshape([2, 2, 5]).astype(dtype)
+      self._compareAll(x, 0)
+      self._compareAll(x, 1)
+      self._compareAll(x, 2)
+
+  def testInvalidAxis(self):
+    """Expects InvalidArgumentError for out-of-range and non-scalar axes."""
+    x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
+    input_tensor = tf.convert_to_tensor(x)
+    with self.test_session():
+      # The input is 2-D; axis -1 and axis 2 must both be rejected.
+      with self.assertRaisesWithPredicateMatch(
+          tf.errors.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range" in str(e)):
+        tf.cumprod(input_tensor, -1).eval()
+      with self.assertRaisesWithPredicateMatch(
+          tf.errors.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range" in str(e)):
+        tf.cumprod(input_tensor, 2).eval()
+      # A list-valued axis must be rejected as well.
+      with self.assertRaisesWithPredicateMatch(
+          tf.errors.InvalidArgumentError,
+          lambda e: "axis must be a scalar" in str(e)):
+        tf.cumprod(input_tensor, [0]).eval()
+
+  def _compareGradient(self, shape, axis, exclusive, reverse):
+    """Compares the two Jacobians returned by `tf.test.compute_gradient`.
+
+    Args:
+      shape: list of ints the 8-element input is reshaped to, so its product
+        must be 8. Also passed as the shape of the `tf.cumprod` output.
+      axis: axis handed to `tf.cumprod`.
+      exclusive: `exclusive` option handed to `tf.cumprod`.
+      reverse: `reverse` option handed to `tf.cumprod`.
+    """
+    x = np.arange(1, 9).reshape(shape).astype(np.float64)
+    with self.test_session():
+      t = tf.convert_to_tensor(x)
+      result = tf.cumprod(t, axis, exclusive, reverse)
+      jacob_t, jacob_n = tf.test.compute_gradient(t,
+                                                  shape,
+                                                  result,
+                                                  shape,
+                                                  x_init_value=x,
+                                                  delta=1)
+    self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
+
+  def testGradient(self):
+    self._compareGradient([8], 0, False, False)
+
+  def testGradientReverse(self):
+    self._compareGradient([8], 0, False, True)
+
+  def testGradientExclusive(self):
+    self._compareGradient([8], 0, True, False)
+
+  def testGradientExclusiveReverse(self):
+    self._compareGradient([8], 0, True, True)
+
+  def testGradient2D(self):
+    """Checks the gradient on a 2x4 input for every axis/option combination."""
+    for axis in [0, 1]:
+      for exclusive in [True, False]:
+        for reverse in [True, False]:
+          self._compareGradient([2, 4], axis, exclusive, reverse)
+
+
+if __name__ == "__main__":
+  tf.test.main()
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@ -1021,6 +1021,12 @@ def _ResizeShape(op):
  return [tensor_shape.TensorShape(
      [input_shape[0], height, width, input_shape[3]])]

+@ops.RegisterShape('DecodeGif')
+def _ImageDecodeShape(op):
+  """Shape function for the DecodeGif op.
+
+  Args:
+    op: the op whose output shape is requested; its first input is merged
+      with a scalar shape.
+
+  Returns:
+    A single-element list holding a rank-4 shape whose last dimension is 3
+    and whose other dimensions are unknown.
+  """
+  # The merged shape itself is not needed; the merge is only performed as a
+  # check on the input (presumably raising for non-scalar inputs — confirm).
+  contents_shape = op.inputs[0].get_shape()
+  contents_shape.merge_with(tensor_shape.scalar())
+  return [tensor_shape.TensorShape([None, None, None, 3])]

@ops.RegisterShape('DecodeJpeg')
@ops.RegisterShape('DecodePng')
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@ -27,6 +27,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin

 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@ -42,34 +43,37 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
    np.random.seed(7)
    batch_size = 5
    shape = (batch_size, 2, 7, 3)
-    inp = np.random.rand(*shape).astype(np.float32)

-    # Convert to HSV and back, as a batch and individually
-    with self.test_session() as sess:
-      batch0 = constant_op.constant(inp)
-      batch1 = image_ops.rgb_to_hsv(batch0)
-      batch2 = image_ops.hsv_to_rgb(batch1)
-      split0 = array_ops.unpack(batch0)
-      split1 = list(map(image_ops.rgb_to_hsv, split0))
-      split2 = list(map(image_ops.hsv_to_rgb, split1))
-      join1 = array_ops.pack(split1)
-      join2 = array_ops.pack(split2)
-      batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+    for nptype in [np.float32, np.float64]:
+      inp = np.random.rand(*shape).astype(nptype)

-    # Verify that processing batch elements together is the same as separate
-    self.assertAllClose(batch1, join1)
-    self.assertAllClose(batch2, join2)
-    self.assertAllClose(batch2, inp)
+      # Convert to HSV and back, as a batch and individually
+      with self.test_session() as sess:
+        batch0 = constant_op.constant(inp)
+        batch1 = image_ops.rgb_to_hsv(batch0)
+        batch2 = image_ops.hsv_to_rgb(batch1)
+        split0 = array_ops.unpack(batch0)
+        split1 = list(map(image_ops.rgb_to_hsv, split0))
+        split2 = list(map(image_ops.hsv_to_rgb, split1))
+        join1 = array_ops.pack(split1)
+        join2 = array_ops.pack(split2)
+        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+
+      # Verify that processing batch elements together is the same as separate
+      self.assertAllClose(batch1, join1)
+      self.assertAllClose(batch2, join2)
+      self.assertAllClose(batch2, inp)

  def testRGBToHSVRoundTrip(self):
    data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
-    rgb_np = np.array(data, dtype=np.float32).reshape([2, 2, 3]) / 255.
-    for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
-        hsv = image_ops.rgb_to_hsv(rgb_np)
-        rgb = image_ops.hsv_to_rgb(hsv)
-        rgb_tf = rgb.eval()
-    self.assertAllClose(rgb_tf, rgb_np)
+    for nptype in [np.float32, np.float64]:
+      rgb_np = np.array(data, dtype=nptype).reshape([2, 2, 3]) / 255.
+      for use_gpu in [True, False]:
+        with self.test_session(use_gpu=use_gpu):
+          hsv = image_ops.rgb_to_hsv(rgb_np)
+          rgb = image_ops.hsv_to_rgb(hsv)
+          rgb_tf = rgb.eval()
+      self.assertAllClose(rgb_tf, rgb_np)


 class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
@ -1609,6 +1613,56 @@ class PngTest(test_util.TensorFlowTestCase):
                         [None, None, channels or None])


+class GifTest(test_util.TensorFlowTestCase):
+  """Tests for `image_ops.decode_gif`."""
+
+  def testValid(self):
+    """Decodes scan.gif and compares every frame with a generated image."""
+    # Read some real GIFs
+    prefix = 'tensorflow/core/lib/gif/testdata/'
+    filename = 'scan.gif'
+    WIDTH = 20
+    HEIGHT = 40
+    STRIDE = 5
+    shape = (12, HEIGHT, WIDTH, 3)
+
+    with self.test_session() as sess:
+      gif0 = io_ops.read_file(prefix + filename)
+      image0 = image_ops.decode_gif(gif0)
+      gif0, image0 = sess.run([gif0, image0])
+
+      self.assertEqual(image0.shape, shape)
+
+      for frame_idx, frame in enumerate(image0):
+        # Expected frame: all zeros except one band of 255s, STRIDE wide.
+        gt = np.zeros(shape[1:], dtype=np.uint8)
+        start = frame_idx * STRIDE
+        end = (frame_idx + 1) * STRIDE
+        if end <= WIDTH:
+          # While the band still fits within WIDTH it covers columns.
+          gt[:, start:end, :] = 255
+        else:
+          # Afterwards the band covers rows, counted from WIDTH onwards.
+          start -= WIDTH
+          end -= WIDTH
+          gt[start:end, :, :] = 255
+
+        self.assertAllClose(frame, gt)
+
+  def testInValid(self):
+    """Expects InvalidArgumentError when decoding optimized.gif."""
+    prefix = 'tensorflow/core/lib/gif/testdata/'
+    filename = 'optimized.gif'
+
+    with self.test_session() as sess:
+      gif0 = io_ops.read_file(prefix + filename)
+      image0 = image_ops.decode_gif(gif0)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run([gif0, image0])
+
+  def testShape(self):
+    """Checks the static shape reported for a decode_gif output."""
+    with self.test_session():
+      gif = constant_op.constant('nonsense')
+      image = image_ops.decode_gif(gif)
+      # Only the static shape is inspected; the op is never run.
+      self.assertEqual(image.get_shape().as_list(),
+                       [None, None, None, 3])
+
 class ConvertImageTest(test_util.TensorFlowTestCase):

  def _convert(self, original, original_dtype, output_dtype, expected):
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@ -109,13 +109,41 @@ def _MeanGrad(op, grad):
@ops.RegisterGradient("Prod")
 def _ProdGrad(op, grad):
  """Gradient for Prod."""
-  # TODO(kearnes): this gives NaNs for 0s in the input tensor
+  # The gradient can be expressed by dividing the product by each entry of the
+  # input tensor, but this approach can't deal with zeros in the input.
+  # Here, we avoid this problem by composing the output as a product of two
+  # cumprod operations.
+
  input_shape = array_ops.shape(op.inputs[0])
+
+  # Expand grad to full input shape
  output_shape_kept_dims = math_ops.reduced_shape(input_shape, op.inputs[1])
  tile_scaling = _safe_shape_div(input_shape, output_shape_kept_dims)
-  grad = array_ops.reshape(grad * op.outputs[0], output_shape_kept_dims)
-  grad = math_ops.div(array_ops.tile(grad, tile_scaling), op.inputs[0])
-  return grad, None
+  grad = array_ops.reshape(grad, output_shape_kept_dims)
+  grad = array_ops.tile(grad, tile_scaling)
+
+  # Pack all reduced dimensions into a single one, so we can perform the
+  # cumprod ops. If the reduction dims list is empty, it defaults to float32,
+  # so we need to cast here.
+  reduced = math_ops.cast(op.inputs[1], dtypes.int32)
+  idx = math_ops.range(0, array_ops.rank(op.inputs[0]))
+  other, _ = array_ops.listdiff(idx, reduced)
+  perm = array_ops.concat(0, [reduced, other])
+  reduced_num = math_ops.reduce_prod(array_ops.gather(input_shape, reduced))
+  other_num = math_ops.reduce_prod(array_ops.gather(input_shape, other))
+  permuted = array_ops.transpose(op.inputs[0], perm)
+  permuted_shape = array_ops.shape(permuted)
+  reshaped = array_ops.reshape(permuted, (reduced_num, other_num))
+
+  # Calculate product, leaving out the current entry
+  left = math_ops.cumprod(reshaped, axis=0, exclusive=True)
+  right = math_ops.cumprod(reshaped, axis=0, exclusive=True, reverse=True)
+  y = array_ops.reshape(left * right, permuted_shape)
+
+  # Invert the transpose and reshape operations.
+  # Make sure to set the statically known shape information through a reshape.
+  out = grad * array_ops.transpose(y, array_ops.invert_permutation(perm))
+  return array_ops.reshape(out, input_shape), None


@ops.RegisterGradient("SegmentSum")
@ -839,3 +867,26 @@ def _CrossGrad(op, grad):
  u = op.inputs[0]
  v = op.inputs[1]
  return (math_ops.cross(v, grad), math_ops.cross(grad, u))
+
+
+@ops.RegisterGradient("Cumsum")
+def _CumsumGrad(op, grad):
+  """Gradient for Cumsum: a cumsum of `grad` run in the opposite direction."""
+  scan_axis = op.inputs[1]
+  is_exclusive = op.get_attr("exclusive")
+  flipped = not op.get_attr("reverse")
+  grad_x = math_ops.cumsum(grad, scan_axis, exclusive=is_exclusive,
+                           reverse=flipped)
+  # The axis input receives no gradient.
+  return [grad_x, None]
+
+
+@ops.RegisterGradient("Cumprod")
+def _CumprodGrad(op, grad):
+  """Gradient for Cumprod; divides by the input, so zeros in `x` break it."""
+  x = op.inputs[0]
+  axis = op.inputs[1]
+  exclusive = op.get_attr("exclusive")
+  reverse = op.get_attr("reverse")
+
+  # TODO This fails when x contains 0 and should be fixed
+  # (the final `out / x` below divides by the input elementwise).
+  prod = math_ops.cumprod(x, axis, exclusive=exclusive, reverse=reverse)
+  # Accumulate prod * grad in the direction opposite to the forward scan.
+  out = math_ops.cumsum(prod * grad, axis, exclusive=exclusive,
+                        reverse=not reverse)
+  # The axis input receives no gradient.
+  return [out / x, None]
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@ -13,7 +13,10 @@
 # limitations under the License.
 # ==============================================================================

-"""## Arithmetic Operators
+"""Note: Elementwise binary operations in TensorFlow follow [numpy-style
+broadcasting](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).
+
+## Arithmetic Operators

 TensorFlow provides several operations that you can use to add basic arithmetic
 operators to your graph.
@ -145,6 +148,14 @@ common math computations that reduce various dimensions of a tensor.

@@accumulate_n

+## Scan
+
+TensorFlow provides several operations that you can use to perform scans
+(running totals) across one axis of a tensor.
+
+@@cumsum
+@@cumprod
+
 ## Segmentation

 TensorFlow provides several operations that you can use to perform common
@ -1585,6 +1596,94 @@ def tanh(x, name=None):
      return gen_math_ops._tanh(x, name=name)


+def cumsum(x, axis=0, exclusive=False, reverse=False, name=None):
+  """Compute the cumulative sum of the tensor `x` along `axis`.
+
+  By default, this op performs an inclusive cumsum, which means that the first
+  element of the input is identical to the first element of the output:
+  ```prettyprint
+  tf.cumsum([a, b, c]) ==> [a, a + b, a + b + c]
+  ```
+
+  By setting the `exclusive` kwarg to `True`, an exclusive cumsum is performed
+  instead:
+  ```prettyprint
+  tf.cumsum([a, b, c], exclusive=True) ==> [0, a, a + b]
+  ```
+
+  By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+  opposite direction:
+  ```prettyprint
+  tf.cumsum([a, b, c], reverse=True) ==> [a + b + c, b + c, c]
+  ```
+  This is more efficient than using separate `tf.reverse` ops.
+
+  The `reverse` and `exclusive` kwargs can also be combined:
+  ```prettyprint
+  tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0]
+  ```
+
+  Args:
+    x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+     `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+     `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+    axis: A `Tensor` of type `int32` (default: 0).
+    exclusive: A `bool` (default: False). If `True`, perform an exclusive
+      cumsum.
+    reverse: A `bool` (default: False).
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `x`.
+  """
+  with ops.op_scope([x], name, "Cumsum") as name:
+    x = ops.convert_to_tensor(x, name="x")
+    return gen_math_ops.cumsum(x, axis, exclusive=exclusive,
+                               reverse=reverse, name=name)
+
+
+def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
+  """Compute the cumulative product of the tensor `x` along `axis`.
+
+  By default, this op performs an inclusive cumprod, which means that the first
+  element of the input is identical to the first element of the output:
+  ```prettyprint
+  tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c]
+  ```
+
+  By setting the `exclusive` kwarg to `True`, an exclusive cumprod is performed
+  instead. The scan then starts from the multiplicative identity, 1:
+  ```prettyprint
+  tf.cumprod([a, b, c], exclusive=True) ==> [1, a, a * b]
+  ```
+
+  By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+  opposite direction:
+  ```prettyprint
+  tf.cumprod([a, b, c], reverse=True) ==> [a * b * c, b * c, c]
+  ```
+  This is more efficient than using separate `tf.reverse` ops.
+
+  The `reverse` and `exclusive` kwargs can also be combined:
+  ```prettyprint
+  tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 1]
+  ```
+
+  Args:
+    x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+     `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+     `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+    axis: A `Tensor` of type `int32` (default: 0).
+    exclusive: A `bool` (default: False). If `True`, perform an exclusive
+      cumprod.
+    reverse: A `bool` (default: False).
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `x`.
+  """
+  with ops.op_scope([x], name, "Cumprod") as name:
+    x = ops.convert_to_tensor(x, name="x")
+    return gen_math_ops.cumprod(x, axis, exclusive=exclusive,
+                                reverse=reverse, name=name)
+
+
 ops.RegisterShape("Abs")(common_shapes.unchanged_shape)
 ops.RegisterShape("Acos")(common_shapes.unchanged_shape)
 ops.RegisterShape("Asin")(common_shapes.unchanged_shape)
@ -1632,6 +1731,8 @@ ops.RegisterShape("BatchFFT3D")(common_shapes.unchanged_shape)
 ops.RegisterShape("BatchIFFT3D")(common_shapes.unchanged_shape)
 ops.RegisterShape("TanhGrad")(common_shapes.unchanged_shape)
 ops.RegisterShape("SigmoidGrad")(common_shapes.unchanged_shape)
+ops.RegisterShape("Cumsum")(common_shapes.unchanged_shape)
+ops.RegisterShape("Cumprod")(common_shapes.unchanged_shape)


@ops.RegisterShape("Add")
--- a/tensorflow/python/ops/rnn_cell.py
+++ b/tensorflow/python/ops/rnn_cell.py
@ -648,7 +648,7 @@ class DropoutWrapper(RNNCell):
                       % input_keep_prob)
    if (isinstance(output_keep_prob, float) and
        not (output_keep_prob >= 0.0 and output_keep_prob <= 1.0)):
-      raise ValueError("Parameter input_keep_prob must be between 0 and 1: %d"
+      raise ValueError("Parameter output_keep_prob must be between 0 and 1: %d"
                       % output_keep_prob)
    self._cell = cell
    self._input_keep_prob = input_keep_prob
--- a/tensorflow/python/platform/default/init.py
+++ b/tensorflow/python/platform/default/init.py
--- a/tensorflow/python/platform/gfile.py
+++ b/tensorflow/python/platform/gfile.py
@ -395,13 +395,14 @@ def Walk(top, topdown=1, onerror=None):
  optional argument "onerror" is specified, it should be a function.  It
  will be called with one argument, an os.error instance.  It can return
  to continue with the walk, or reraise the exception to abort the walk.
+  By default, the walk follows symlinks that resolve into directories.

  Yields:
    # Each yield is a 3-tuple:  the pathname of a directory, followed
    # by lists of all its subdirectories and leaf files.
    (dirname, [subdirname, subdirname, ...], [filename, filename, ...])
  """
-  return os.walk(top, topdown=topdown, onerror=onerror)
+  return os.walk(top, topdown=topdown, onerror=onerror, followlinks=True)


 def Stat(path):   # pylint: disable=invalid-name
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@ -92,7 +92,7 @@ def input_producer(input_tensor, element_shape=None, num_epochs=None,
  """Output the rows of `input_tensor` to a queue for an input pipeline.

  Args:
-    input_tensor: A tensor with the rows to produce. Must be at
+    input_tensor: A tensor with the rows to produce. Must be at least
      one-dimensional. Must either have a fully-defined shape, or
      `element_shape` must be defined.
    element_shape: (Optional.) A `TensorShape` representing the shape of a
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import control_flow_ops
@ -40,7 +41,7 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
                          decay_rate ^ (global_step / decay_steps)
  ```

-  If the argument `staircase` is `True`, then `global_step /decay_steps` is an
+  If the argument `staircase` is `True`, then `global_step / decay_steps` is an
  integer division and the decayed learning rate follows a staircase function.

  Example: decay every 100000 steps with a base of 0.96:
@ -67,15 +68,16 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
      Must be positive.  See the decay computation above.
    decay_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The decay rate.
-    staircase: Boolean.  It `True` decay the learning rate at discrete intervals.
-    name: String.  Optional name of the operation.  Defaults to 'ExponentialDecay'
+    staircase: Boolean.  If `True` decay the learning rate at discrete intervals
+    name: String.  Optional name of the operation.  Defaults to
+      'ExponentialDecay'

  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.
  """
  with ops.op_scope([learning_rate, global_step, decay_steps, decay_rate],
-                   name, "ExponentialDecay") as name:
+                    name, "ExponentialDecay") as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
@ -237,3 +239,125 @@ def polynomial_decay(learning_rate, global_step, decay_steps,
    return math_ops.add(math_ops.mul(learning_rate - end_learning_rate,
                                     math_ops.pow(1 - p, power)),
                        end_learning_rate, name=name)
+
+
+def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate,
+                      staircase=False, name=None):
+  """Applies natural exponential decay to the initial learning rate.
+
+  When training a model, it is often recommended to lower the learning rate as
+  the training progresses.  This function applies an exponential decay function
+  to a provided initial learning rate.  It requires a `global_step` value to
+  compute the decayed learning rate.  You can just pass a TensorFlow variable
+  that you increment at each training step.
+
+  The function returns the decayed learning rate.  It is computed as:
+
+  ```python
+  decayed_learning_rate = learning_rate *
+                          exp(-decay_rate * global_step / decay_steps)
+  ```
+
+  If the argument `staircase` is `True`, then `global_step / decay_steps` is
+  rounded down to a whole number and the decayed learning rate follows a
+  staircase function.
+
+  Example: decay exponentially with a rate of 0.5 every 1000 steps:
+
+  ```python
+  ...
+  global_step = tf.Variable(0, trainable=False)
+  learning_rate = 0.1
+  decay_steps = 1000
+  k = 0.5
+  learning_rate = tf.train.natural_exp_decay(learning_rate, global_step,
+                                             decay_steps, k)
+
+  # Passing global_step to minimize() will increment it at each step.
+  learning_step = (
+      tf.train.GradientDescentOptimizer(learning_rate)
+      .minimize(...my loss..., global_step=global_step)
+  )
+  ```
+
+  Args:
+    learning_rate: A scalar `float32` or `float64` `Tensor` or a
+      Python number.  The initial learning rate.
+    global_step: A scalar `Tensor` or a Python number.
+      Global step to use for the decay computation.  Must not be negative.
+    decay_steps: A scalar `Tensor` or a Python number.  Must be positive.
+      `global_step` is divided by this value before the decay is applied.
+    decay_rate: A scalar `Tensor` or a Python number.  The decay rate.
+    staircase: Boolean.  If `True`, decay the learning rate at discrete
+      intervals.
+    name: String.  Optional name of the operation.  Defaults to
+      'NaturalExpDecay'
+
+  Returns:
+    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
+    learning rate.
+  """
+  # decay_steps is listed with the other inputs so that it takes part in the
+  # op scope as well.
+  with ops.op_scope([learning_rate, global_step, decay_steps, decay_rate],
+                    name, "NaturalExpDecay") as name:
+    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
+    dtype = learning_rate.dtype
+    # Cast every input to the learning rate's dtype so the arithmetic below
+    # operates on matching types.
+    global_step = math_ops.cast(global_step, dtype)
+    decay_steps = math_ops.cast(decay_steps, dtype)
+    decay_rate = math_ops.cast(decay_rate, dtype)
+    p = global_step / decay_steps
+    if staircase:
+      p = math_ops.floor(p)
+    exponent = math_ops.exp(math_ops.mul(math_ops.neg(decay_rate), p))
+    return math_ops.mul(learning_rate, exponent, name=name)
+
+
+def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
+                       staircase=False, name=None):
+  """Applies inverse time decay to the initial learning rate.
+
+  When training a model, it is often recommended to lower the learning rate as
+  the training progresses.  This function applies an inverse decay function
+  to a provided initial learning rate.  It requires a `global_step` value to
+  compute the decayed learning rate.  You can just pass a TensorFlow variable
+  that you increment at each training step.
+
+  The function returns the decayed learning rate.  It is computed as:
+
+  ```python
+  decayed_learning_rate = learning_rate /
+                          (1 + decay_rate * global_step / decay_steps)
+  ```
+
+  If the argument `staircase` is `True`, then `global_step / decay_steps` is
+  rounded down to a whole number and the decayed learning rate follows a
+  staircase function.
+
+  Example: decay 1/t with a rate of 0.5:
+
+  ```python
+  ...
+  global_step = tf.Variable(0, trainable=False)
+  learning_rate = 0.1
+  decay_steps = 1.0
+  k = 0.5
+  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step,
+                                              decay_steps, k)
+
+  # Passing global_step to minimize() will increment it at each step.
+  learning_step = (
+      tf.train.GradientDescentOptimizer(learning_rate)
+      .minimize(...my loss..., global_step=global_step)
+  )
+  ```
+
+  Args:
+    learning_rate: A scalar `float32` or `float64` `Tensor` or a
+      Python number.  The initial learning rate.
+    global_step: A scalar `Tensor` or a Python number.
+      Global step to use for the decay computation.  Must not be negative.
+    decay_steps: A scalar `Tensor` or a Python number.  Must be positive.
+      `global_step` is divided by this value before the decay is applied.
+    decay_rate: A scalar `Tensor` or a Python number.  The decay rate.
+    staircase: Boolean.  If `True`, decay the learning rate at discrete
+      intervals.
+    name: String.  Optional name of the operation.  Defaults to
+      'InverseTimeDecay'
+
+  Returns:
+    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
+    learning rate.
+  """
+
+  # decay_steps is listed with the other inputs so that it takes part in the
+  # op scope as well.
+  with ops.op_scope([learning_rate, global_step, decay_steps, decay_rate],
+                    name, "InverseTimeDecay") as name:
+    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
+    dtype = learning_rate.dtype
+    # Cast every input to the learning rate's dtype so the arithmetic below
+    # operates on matching types.
+    global_step = math_ops.cast(global_step, dtype)
+    decay_steps = math_ops.cast(decay_steps, dtype)
+    decay_rate = math_ops.cast(decay_rate, dtype)
+    p = global_step / decay_steps
+    if staircase:
+      p = math_ops.floor(p)
+    const = math_ops.cast(constant_op.constant(1), learning_rate.dtype)
+    denom = math_ops.add(const, math_ops.mul(decay_rate, p))
+    return math_ops.div(learning_rate, denom, name=name)
--- a/tensorflow/python/training/learning_rate_decay_test.py
+++ b/tensorflow/python/training/learning_rate_decay_test.py
@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import math
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import state_ops
@ -50,7 +52,7 @@ class LRDecayTest(test_util.TensorFlowTestCase):
      self.assertAllClose(decayed_lr.eval(), .1, 1e-6)
      # Decayed learning rate
      assign_100.op.run()
-      expected = .1 * 0.96**(100 // 3)
+      expected = .1 * 0.96 ** (100 // 3)
      self.assertAllClose(decayed_lr.eval(), expected, 1e-6)

  def testVariables(self):
@ -69,7 +71,7 @@ class LRDecayTest(test_util.TensorFlowTestCase):
      self.assertAllClose(decayed_lr.eval(), .1, 1e-6)
      # Decayed learning rate
      assign_100.op.run()
-      expected = .1 * 0.96**(100 // 3)
+      expected = .1 * 0.96 ** (100 // 3)
      self.assertAllClose(decayed_lr.eval(), expected, 1e-6)

  def testPiecewiseConstant(self):
@ -215,5 +217,83 @@ class SqrtDecayTest(test_util.TensorFlowTestCase):
      self.assertAllClose(decayed_lr.eval(), expected, 1e-6)


+class ExponentialDecayTest(test_util.TensorFlowTestCase):
+  """Tests for `learning_rate_decay.natural_exp_decay`."""
+
+  def testDecay(self):
+    # Continuous decay: compare against the closed form at steps 0..k.
+    initial_lr = 0.1
+    k = 10
+    decay_rate = 0.96
+    step = state_ops.variable_op([], dtypes.int32)
+    assign_step = state_ops.assign(step, 0)
+    increment_step = state_ops.assign_add(step, 1)
+    decayed_lr = learning_rate_decay.natural_exp_decay(initial_lr, step,
+                                                       k, decay_rate)
+    with self.test_session():
+      assign_step.op.run()
+      for i in range(k+1):
+        expected = initial_lr * math.exp(-i / k * decay_rate)
+        self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
+        # Advance the step variable for the next iteration.
+        increment_step.op.run()
+
+  def testStaircase(self):
+    # Staircase decay: the expected value uses the integer quotient i // k.
+    initial_lr = 0.1
+    k = 10
+    decay_rate = 0.96
+    step = state_ops.variable_op([], dtypes.int32)
+    assign_step = state_ops.assign(step, 0)
+    increment_step = state_ops.assign_add(step, 1)
+    decayed_lr = learning_rate_decay.natural_exp_decay(initial_lr,
+                                                       step,
+                                                       k,
+                                                       decay_rate,
+                                                       staircase=True)
+    with self.test_session():
+      assign_step.op.run()
+      for i in range(k+1):
+        expected = initial_lr * math.exp(-decay_rate * (i // k))
+        self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
+        # Advance the step variable for the next iteration.
+        increment_step.op.run()
+
+
+class InverseDecayTest(test_util.TensorFlowTestCase):
+  """Tests for `learning_rate_decay.inverse_time_decay`."""
+
+  def testDecay(self):
+    # Continuous decay: compare against the closed form at steps 0..k.
+    initial_lr = 0.1
+    k = 10
+    decay_rate = 0.96
+    step = state_ops.variable_op([], dtypes.int32)
+    assign_step = state_ops.assign(step, 0)
+    increment_step = state_ops.assign_add(step, 1)
+    decayed_lr = learning_rate_decay.inverse_time_decay(initial_lr,
+                                                        step,
+                                                        k,
+                                                        decay_rate)
+    with self.test_session():
+      assign_step.op.run()
+      for i in range(k+1):
+        expected = initial_lr / (1 + i / k * decay_rate)
+        self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
+        # Advance the step variable for the next iteration.
+        increment_step.op.run()
+
+  def testStaircase(self):
+    # Staircase decay: the expected value uses the integer quotient i // k.
+    initial_lr = 0.1
+    k = 10
+    decay_rate = 0.96
+    step = state_ops.variable_op([], dtypes.int32)
+    assign_step = state_ops.assign(step, 0)
+    increment_step = state_ops.assign_add(step, 1)
+    decayed_lr = learning_rate_decay.inverse_time_decay(initial_lr,
+                                                        step,
+                                                        k,
+                                                        decay_rate,
+                                                        staircase=True)
+    with self.test_session():
+      assign_step.op.run()
+      for i in range(k+1):
+        expected = initial_lr / (1 + decay_rate * (i // k))
+        self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
+        # Advance the step variable for the next iteration.
+        increment_step.op.run()
+
+
 if __name__ == "__main__":
  googletest.main()
--- a/tensorflow/python/training/momentum.py
+++ b/tensorflow/python/training/momentum.py
@ -31,7 +31,7 @@ class MomentumOptimizer(optimizer.Optimizer):
  """

  def __init__(self, learning_rate, momentum,
-               use_locking=False, name="Momentum"):
+               use_locking=False, name="Momentum", use_nesterov=False):
    """Construct a new Momentum optimizer.

    Args:
@ -44,6 +44,7 @@ class MomentumOptimizer(optimizer.Optimizer):
    super(MomentumOptimizer, self).__init__(use_locking, name)
    self._learning_rate = learning_rate
    self._momentum = momentum
+    self._use_nesterov = use_nesterov

  def _create_slots(self, var_list):
    for v in var_list:
@ -62,7 +63,8 @@ class MomentumOptimizer(optimizer.Optimizer):
        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
        grad,
        math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
-        use_locking=self._use_locking).op
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov).op

  def _apply_sparse(self, grad, var):
    mom = self.get_slot(var, "momentum")
@ -71,4 +73,5 @@ class MomentumOptimizer(optimizer.Optimizer):
        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
        grad.values, grad.indices,
        math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
-        use_locking=self._use_locking).op
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov).op
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@ -25,6 +25,13 @@ import tensorflow as tf

 class MomentumOptimizerTest(tf.test.TestCase):

+  def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum):
+    """Reference Nesterov momentum step; returns the new (var, accum)."""
+    # Shift the variable by the scaled previous accumulator.
+    shifted = var + accum * lr * momentum
+    # Fold the gradient into the accumulator.
+    new_accum = accum * momentum + g
+    # Apply the step, then the correction with the updated accumulator.
+    stepped = shifted - lr * new_accum
+    return stepped - new_accum * lr * momentum, new_accum
+
  def testBasic(self):
    for dtype in [tf.half, tf.float32, tf.float64]:
      with self.test_session():
@ -80,6 +87,68 @@ class MomentumOptimizerTest(tf.test.TestCase):
                      3.98 - ((0.9 * 0.01 + 0.01) * 2.0)]),
            var1.eval())

+  def testNesterovMomentum(self):
+    # Checks dense Nesterov updates against the numpy reference helper.
+    for dtype in [tf.float32, tf.float64]:
+      with self.test_session():
+        var0 = tf.Variable([1.0, 2.0], dtype=dtype)
+        var1 = tf.Variable([3.0, 4.0], dtype=dtype)
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        # d(cost)/d(var0) = 10 * var0 and d(cost)/d(var1) = 3; these are the
+        # gradients handed to the numpy reference below.
+        cost = 5 * var0 * var0 + 3 * var1
+        global_step = tf.Variable(tf.zeros([], tf.int64), name='global_step')
+        mom_op = tf.train.MomentumOptimizer(learning_rate=2.0, momentum=0.9,
+            use_nesterov=True)
+        opt_op = mom_op.minimize(cost, global_step, [var0, var1])
+        tf.initialize_all_variables().run()
+        for t in range(1, 5):
+          opt_op.run()
+          # Mirror the same step in numpy, using the pre-update var0_np to
+          # form the gradient.
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(var0_np,
+              accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+              accum1_np, 3, 2.0, 0.9)
+          self.assertAllClose(var0_np, var0.eval())
+          self.assertAllClose(var1_np, var1.eval())
+
+  def testSparseNesterovMomentum(self):
+    # Checks Nesterov updates fed through IndexedSlices gradients against the
+    # numpy reference helper.
+    for dtype in [tf.float32, tf.float64]:
+      with self.test_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        # First pass: run the numpy reference to record the var0 gradient
+        # (10 * var0) that will be fed at each step.
+        grads = []
+        for t in range(1, 5):
+          grads.append(var0_np * 10)
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(var0_np,
+              accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+              accum1_np, 3, 2.0, 0.9)
+        # Reset the numpy state so the second pass starts from the same
+        # initial values as the TensorFlow variables.
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        var0 = tf.Variable(var0_np)
+        var1 = tf.Variable(var1_np)
+        # NOTE(review): `loss` is not referenced again in this test; the
+        # gradients are supplied explicitly below.
+        loss = 5 * var0 * var0 + 3 * var1
+        mom_op = tf.train.MomentumOptimizer(learning_rate=2.0, momentum=0.9,
+            use_nesterov=True)
+        # The var0 gradient is fed as IndexedSlices over indices [0, 1].
+        x_feed = tf.placeholder(dtype)
+        y_feed = tf.IndexedSlices(x_feed,tf.constant([0, 1]),tf.constant([2]))
+        grads_and_vars = [(y_feed, var0),
+            (tf.constant([3.0,3.0],dtype=dtype), var1)]
+        opt_update = mom_op.apply_gradients(grads_and_vars)
+        tf.initialize_all_variables().run()
+        # Second pass: apply the recorded gradients in TensorFlow and compare
+        # with the numpy reference after every step.
+        for t in range(1, 5):
+          opt_update.run(feed_dict = {x_feed:grads[t - 1]})
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(var0_np,
+              accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+              accum1_np, 3, 2.0, 0.9)
+          self.assertAllClose(var0_np, var0.eval())
+          self.assertAllClose(var1_np, var1.eval())
+
  def testTensorLearningRateAndMomentum(self):
    for dtype in [tf.half, tf.float32, tf.float64]:
      with self.test_session():
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@ -314,8 +314,17 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
  if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier, (const void**)&cuda_driver_info)) {
    // NOTE: OSX CUDA driver does not currently store the same driver version
    // in kCFBundleVersionKey as is returned by cuDriverGetVersion
-    const char * version = CFStringGetCStringPtr((CFStringRef)CFDictionaryGetValue(cuda_driver_info, kCFBundleVersionKey), kCFStringEncodingUTF8);
    CFRelease(kext_infos);
+    const CFStringRef str = (CFStringRef)CFDictionaryGetValue(
+        cuda_driver_info, kCFBundleVersionKey);
+    const char *version = CFStringGetCStringPtr(str, kCFStringEncodingUTF8);
+
+    // version can be NULL in which case treat it as empty string
+    // see
+    // https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
+    if (version == NULL) {
+      return StringToDriverVersion("");
+    }
    return StringToDriverVersion(version);
  }
  CFRelease(kext_infos);
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@ -54,6 +54,15 @@ NarrowT CheckedNarrowing(const WideT& wide) {
  return narrow;
 }

+// Maps a full CuDNN version number onto its "compatibility" version, the
+// number that tries to indicate ABI compatibility, by clearing the two
+// lowest decimal digits.
+//
+// For example, a cudnn_version of 5107 yields a compatibility version
+// number of 5100.
+size_t cudnnCompatibilityVersion(size_t cudnn_version) {
+  const size_t low_digits = cudnn_version % 100;
+  return cudnn_version - low_digits;
+}
+
 }  // namespace

 namespace perftools {
@ -139,13 +148,6 @@ size_t cudnnGetVersion() {
  return callable();
 }

-// Returns whether the currently loaded cuDNN version is R2.
-bool IsCudnnR2() {
-  static auto version = cudnnGetVersion();
-  DCHECK_GE(version, 2000);
-  return version < 3000;
-}
-
 #define PERFTOOLS_GPUTOOLS_CUDNN_WRAP(__name)                        \
  struct DynLoadShim__##__name {                                     \
    static const char* kName;                                        \
@ -197,26 +199,13 @@ bool IsCudnnR2() {
  __macro(cudnnPoolingForward)                            \
  __macro(cudnnPoolingBackward)                           \
  __macro(cudnnLRNCrossChannelForward)                    \
-  __macro(cudnnLRNCrossChannelBackward)
-// clang-format on
-
-CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
-
-// clang-format off
-#if CUDNN_VERSION >= 4000 && CUDNN_VERSION < 5000
-#define CUDNN_DNN_ROUTINE_EACH_R2(__macro)                \
-  __macro(cudnnAddTensor_v2)                              \
-  __macro(cudnnConvolutionBackwardData_v2)                \
-  __macro(cudnnConvolutionBackwardFilter_v2)
-#else
-#define CUDNN_DNN_ROUTINE_EACH_R2(__macro)                \
+  __macro(cudnnLRNCrossChannelBackward)                   \
  __macro(cudnnAddTensor)                                 \
  __macro(cudnnConvolutionBackwardData)                   \
  __macro(cudnnConvolutionBackwardFilter)
-#endif
 // clang-format on

-CUDNN_DNN_ROUTINE_EACH_R2(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
+CUDNN_DNN_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)

 // APIs available after R3:
 #if CUDNN_VERSION >= 3000
@ -340,15 +329,21 @@ port::Status CudnnSupport::Init() {
    // Check whether loaded version of CuDNN matches what the source
    // was built with.
    size_t loaded_version = dynload::cudnnGetVersion();
-    bool library_loaded_matches_source = (loaded_version == CUDNN_VERSION);
+    size_t loaded_compat_version = cudnnCompatibilityVersion(loaded_version);
+    size_t compiled_compat_version = cudnnCompatibilityVersion(CUDNN_VERSION);
+    bool library_loaded_matches_source =
+        (loaded_compat_version == compiled_compat_version);
    if (!library_loaded_matches_source) {
      const string error =
-          port::StrCat("Loaded cudnn library: ", loaded_version,
-                       " but source was compiled against ", CUDNN_VERSION,
-                       ".  If using a binary install, upgrade your cudnn "
+          port::StrCat("Loaded runtime CuDNN library: ", loaded_version,
+                       " (compatibility version ", loaded_compat_version,
+                       ") but source was compiled with ", CUDNN_VERSION,
+                       " (compatibility version ", compiled_compat_version,
+                       ").  If using a binary install, upgrade your CuDNN "
                       "library to match.  If building from sources, "
-                       "make sure the library loaded matches the "
-                       "version you specified during compile configuration.");
+                       "make sure the library loaded at runtime matches a "
+                       "compatible version specified during compile "
+                       "configuration.");
      LOG(ERROR) << error;
      return port::Status{port::error::INTERNAL, error};
    }
@ -1109,31 +1104,6 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
  ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
                                   CUDNN_DATA_FLOAT};

-#if CUDNN_VERSION < 5000
-#if CUDNN_VERSION >= 3000
-  if (dynload::IsCudnnR2()) {
-#endif
-#if CUDNN_VERSION >= 4000
-    status = dynload::cudnnConvolutionBackwardData_v2(
-#else
-  status = dynload::cudnnConvolutionBackwardData(
-#endif
-        parent_, ToHandle(dnn_handle_), &alpha, filter.handle(),
-        filter_data.opaque(), out_back_nd.handle(),
-        backward_output_data.opaque(), conv.handle(), &beta,
-        in_back_nd.handle(), backward_input_data->opaque());
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "failed to enqueue convolution on stream: "
-                 << ToString(status);
-      return false;
-    }
-    return true;
-#if CUDNN_VERSION >= 3000
-  }
-#endif
-#endif
-
-#if CUDNN_VERSION >= 3000
  const bool is_profiling = output_profile_result != nullptr;
  cudnnConvolutionBwdDataAlgo_t algo;
  DeviceMemory<uint8> scratch;
@ -1284,7 +1254,6 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
    return false;
  }
  return true;
-#endif
 }

 bool CudnnSupport::DoConvolveBackwardData(
@ -1369,31 +1338,6 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
  ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
      CUDNN_DATA_FLOAT};

-#if CUDNN_VERSION < 5000
-#if CUDNN_VERSION >= 3000
-  if (dynload::IsCudnnR2()) {
-#endif
-#if CUDNN_VERSION >= 4000
-    status = dynload::cudnnConvolutionBackwardFilter_v2(
-#else
-  status = dynload::cudnnConvolutionBackwardFilter(
-#endif
-        parent_, ToHandle(dnn_handle_), &alpha, input_nd.handle(),
-        input_data.opaque(), out_back_nd.handle(),
-        backward_output_data.opaque(), conv.handle(), &beta, filter.handle(),
-        backward_filter_data->opaque());
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "failed to enqueue convolution on stream: "
-                 << ToString(status);
-      return false;
-    }
-    return true;
-#if CUDNN_VERSION >= 3000
-  }
-#endif
-#endif
-
-#if CUDNN_VERSION >= 3000
  const bool is_profiling = output_profile_result != nullptr;
  cudnnConvolutionBwdFilterAlgo_t algo;
  DeviceMemory<uint8> scratch;
@ -1544,7 +1488,6 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
    return false;
  }
  return true;
-#endif
 }

 bool CudnnSupport::DoConvolveBackwardFilter(
@ -1824,33 +1767,15 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,

  const float alpha = 1.0f;
  const float beta = 1.0f;
-#if CUDNN_VERSION >= 3000
-  if (dynload::IsCudnnR2()) {
-#endif

-#if CUDNN_VERSION < 5000
-#if CUDNN_VERSION >= 4000
-    status = dynload::cudnnAddTensor_v2(
-#else
-    status = dynload::cudnnAddTensor(
-#endif
-        parent_, ToHandle(dnn_handle_), CUDNN_ADD_SAME_C, &alpha,
-        bias_descriptor.handle(), biases.opaque(), &beta,
-        input_descriptor.handle(), output_data->opaque());
-#endif  // CUDNN_VERSION < 5000
-
-#if CUDNN_VERSION >= 3000
-  } else {
 #if CUDNN_VERSION >= 5000
-    status = dynload::cudnnAddTensor(
+  status = dynload::cudnnAddTensor(
 #else
-    status = dynload::cudnnAddTensor_v3(
-#endif
-        parent_, ToHandle(dnn_handle_), &alpha, bias_descriptor.handle(),
-        biases.opaque(), &beta, input_descriptor.handle(),
-        output_data->opaque());
-  }
+  status = dynload::cudnnAddTensor_v3(
 #endif
+      parent_, ToHandle(dnn_handle_), &alpha, bias_descriptor.handle(),
+      biases.opaque(), &beta, input_descriptor.handle(),
+      output_data->opaque());

  if (status != CUDNN_STATUS_SUCCESS) {
    LOG(ERROR) << "stream " << stream << " could not enqueue bias addition.";
--- a/tensorflow/tensorboard/BUILD
+++ b/tensorflow/tensorboard/BUILD
@ -10,10 +10,10 @@ exports_files(["LICENSE"])
 filegroup(
    name = "frontend",
    srcs = [
+        "TAG",
        "dist/index.html",
        "dist/tf-tensorboard.html",
-        "TAG",
-        "//tensorflow/tensorboard/bower:bower",
+        "//tensorflow/tensorboard/bower",
        "//tensorflow/tensorboard/lib:all_files",
    ],
 )
--- a/tensorflow/tensorboard/README.md
+++ b/tensorflow/tensorboard/README.md
@ -21,7 +21,7 @@ directory by creating a `SummaryWriter`:
 ``` python
 # sess.graph_def is the graph definition; that enables the Graph Visualizer.

-summary_writer = tf.train.SummaryWriter('/path/to/logs', sess.graph_def)
+summary_writer = tf.train.SummaryWriter('/path/to/logs', sess.graph)
 ```

 For more details, see [this
@ -115,9 +115,9 @@ For example, here is a well-organized TensorBoard log directory, with two runs,

 # The Visualizations

-### Scalar Dashboard
+### Events Dashboard

-TensorBoard's Scalar Dashboard visualizes scalar statistics that vary over time;
+TensorBoard's Events Dashboard visualizes scalar statistics that vary over time;
 for example, you might want to track the model's loss or learning rate. As
 described in *Key Concepts*, you can compare multiple runs, and the data is
 organized by tag. The line charts have the following interactions:
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@ -49,10 +49,11 @@
 # to run.
 #

-# Constants:
 # Fixed naming patterns for wheel (.whl) files given different python versions
-declare -A WHL_TAGS
-WHL_TAGS=(["2.7"]="cp27-none" ["3.4"]="cp34-cp34m" ["3.5"]="cp35-cp35m")
+if [[ $(uname) == "Linux" ]]; then
+  declare -A WHL_TAGS
+  WHL_TAGS=(["2.7"]="cp27-none" ["3.4"]="cp34-cp34m" ["3.5"]="cp35-cp35m")
+fi


 INSTALL_EXTRA_PIP_PACKAGES=${TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES}
--- a/tensorflow/tools/ci_build/builds/test_installation.sh
+++ b/tensorflow/tools/ci_build/builds/test_installation.sh
@ -243,6 +243,8 @@ rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/jpeg
 cp -r tensorflow/core/lib/jpeg ${PY_TEST_DIR}/tensorflow/core/lib
 rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/png
 cp -r tensorflow/core/lib/png ${PY_TEST_DIR}/tensorflow/core/lib
+rm -rf ${PY_TEST_DIR}/tensorflow/core/lib/gif
+cp -r tensorflow/core/lib/gif ${PY_TEST_DIR}/tensorflow/core/lib

 # Copy test data from tensorflow/contrib/ffmpeg

--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@ -174,24 +174,57 @@ function get_cuda_capability_version() {
  fi
 }

-# Process container type
+# Container type, e.g., CPU, GPU
 CTYPE=${TF_BUILD_CONTAINER_TYPE}
+
+# Determine if Docker is available
 OPT_FLAG=""
+if [[ -z "$(which docker)" ]]; then
+  DO_DOCKER=0
+
+  echo "It appears that Docker is not available on this system. "\
+"Will perform build without Docker."
+  echo "Also, the additional option flags will be applied to the build:"
+  echo "  ${NO_DOCKER_OPT_FLAG}"
+  MAIN_CMD="${NO_DOCKER_MAIN_CMD} ${CTYPE}"
+  OPT_FLAG="${OPT_FLAG} ${NO_DOCKER_OPT_FLAG}"
+fi
+
+# Process container type
 if [[ ${CTYPE} == "cpu" ]]; then
  :
 elif [[ ${CTYPE} == "gpu" ]]; then
-  OPT_FLAG="--config=cuda"
+  OPT_FLAG="${OPT_FLAG} --config=cuda"

-  # Attempt to determine CUDA capability version and use it
-  if [[ "${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS}" != \
-        *"TF_CUDA_COMPUTE_CAPABILITIES="* ]]; then
-    CUDA_CAPA_VER=$(get_cuda_capability_version)
-    if [[ ! -z ${CUDA_CAPA_VER} ]]; then
-      echo "TF_CUDA_COMPUTE_CAPABILITIES is not set."
-      echo "Using CUDA capability version from deviceQuery: ${CUDA_CAPA_VER}"
+  # Attempt to determine CUDA capability version automatically and use it if
+  # CUDA capability version is not specified by the environment variables.
+  CUDA_CAPA_VER=$(get_cuda_capability_version)
+
+  if [[ ! -z ${CUDA_CAPA_VER} ]]; then
+    AUTO_CUDA_CAPA_VER=0
+    if [[ ${DO_DOCKER} == "1" ]] && \
+       [[ "${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS}" != \
+           *"TF_CUDA_COMPUTE_CAPABILITIES="* ]]; then
+      AUTO_CUDA_CAPA_VER=1
      TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS=\
 "${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS} -e "\
 "TF_CUDA_COMPUTE_CAPABILITIES=${CUDA_CAPA_VER}"
+
+      echo "Docker GPU build: TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS="\
+"\"${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS}\""
+    elif [[ ${DO_DOCKER} == "0" ]] && \
+         [[ -z "${TF_CUDA_COMPUTE_CAPABILITIES}" ]]; then
+      AUTO_CUDA_CAPA_VER=1
+      TF_CUDA_COMPUTE_CAPABILITIES="${CUDA_CAPA_VER}"
+
+      echo "Non-Docker GPU build: TF_CUDA_COMPUTE_CAPABILITIES="\
+"\"${TF_CUDA_COMPUTE_CAPABILITIES}\""
+    fi
+
+    if [[ ${AUTO_CUDA_CAPA_VER} == "1" ]]; then
+      echo "TF_CUDA_COMPUTE_CAPABILITIES is not set:"
+      echo "Using CUDA capability version from deviceQuery: ${CUDA_CAPA_VER}"
+      echo ""
    fi
  fi
 elif [[ ${CTYPE} == "android" ]]; then
@ -203,19 +236,6 @@ fi

 EXTRA_PARAMS=""

-# Determine if Docker is available
-if [[ -z "$(which docker)" ]]; then
-  DO_DOCKER=0
-
-  echo "It appears that Docker is not available on this system. "\
-"Will perform build without Docker."
-  echo "Also, the additional option flags will be applied to the build:"
-  echo "  ${NO_DOCKER_OPT_FLAG}"
-  MAIN_CMD="${NO_DOCKER_MAIN_CMD} ${CTYPE}"
-  OPT_FLAG="${OPT_FLAG} ${NO_DOCKER_OPT_FLAG}"
-
-fi
-
 # Determine if this is a benchmarks job
 RUN_BENCHMARKS=0
 if [[ ! -z "${TF_BUILD_RUN_BENCHMARKS}" ]] &&
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@ -80,7 +80,7 @@ RUN mkdir /bazel && \

 # Download and build TensorFlow.

-RUN git clone --recursive https://github.com/tensorflow/tensorflow.git && \
+RUN git clone -b r0.9 --recursive --recurse-submodules https://github.com/tensorflow/tensorflow.git && \
    cd tensorflow && \
    git checkout r0.9
 WORKDIR /tensorflow
--- a/tensorflow/tools/gcs_test/Dockerfile
+++ b/tensorflow/tools/gcs_test/Dockerfile
@ -16,7 +16,9 @@ RUN ./install_google_cloud_sdk.bash --disable-prompts --install-dir=/var/gcloud

 # Install nightly TensorFlow pip
 RUN pip install \
-   http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-py2-none-any.whl
+   http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-cp27-none-linux_x86_64.whl

 # Copy test files
-COPY python/gcs_smoke.py /
+RUN mkdir -p /gcs-smoke/python
+COPY gcs_smoke_wrapper.sh /gcs-smoke/
+COPY python/gcs_smoke.py /gcs-smoke/python/
--- a/tensorflow/tools/gcs_test/gcs_smoke.sh
+++ b/tensorflow/tools/gcs_test/gcs_smoke.sh
@ -67,30 +67,8 @@ docker build --no-cache \

 # Run the docker image with the GCS key file mapped and the gcloud-required
 # environment variables set.
-LOG_FILE="/tmp/tf-gcs-test.log"
-rm -rf ${LOG_FILE}
-
 docker run --rm \
    -v ${GCLOUD_JSON_KEY_PATH}:/gcloud-key.json \
    -e "GOOGLE_APPLICATION_CREDENTIALS=/gcloud-key.json" \
    "${DOCKER_IMG}" \
-    python /gcs_smoke.py --gcs_bucket_url="${GCS_BUCKET_URL}" \
-    2>&1 > "${LOG_FILE}"
-
-if [[ $? != "0" ]]; then
-  cat ${LOG_FILE}
-  die "FAIL: End-to-end test of GCS access from TensorFlow failed."
-fi
-
-cat ${LOG_FILE}
-echo ""
-
-# Clean up the newly created tfrecord file in GCS bucket
-NEW_TFREC_URL=$(grep "Using input path" "${LOG_FILE}" | \
-                awk '{print $NF}')
-if [[ -z ${NEW_TFREC_URL} ]]; then
-  die "FAIL: Unable to determine the URL to the new tfrecord file in GCS"
-fi
-gsutil rm "${NEW_TFREC_URL}" && \
-    echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}" || \
-    die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
+    /gcs-smoke/gcs_smoke_wrapper.sh "${GCS_BUCKET_URL}"
--- a/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
+++ b/tensorflow/tools/gcs_test/gcs_smoke_wrapper.sh
@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# In-container wrapper for GCS smoke test.
+#
+# This script invokes gcs_smoke.py and performs tear down afterwards.
+#
+# Usage:
+#   gcs_smoke_wrapper.sh <GCS_BUCKET_URL>
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Helper function: Print the given message verbatim and exit with status 1.
+die () {
+  echo "$@"
+  exit 1
+}
+
+print_usage() {
+  echo "Usage: gcs_smoke_wrapper.sh <GCS_BUCKET_URL>"
+  echo ""
+}
+
+# Sanity check on command-line arguments.
+GCS_BUCKET_URL=$1
+if [[ -z "${GCS_BUCKET_URL}" ]]; then
+  print_usage
+  die "ERROR: Command-line argument GCS_BUCKET_URL is not supplied"
+fi
+
+# Check that gcloud and gsutil binaries are available.
+GCLOUD_BIN="/var/gcloud/google-cloud-sdk/bin/gcloud"
+if [[ ! -f "${GCLOUD_BIN}" ]]; then
+  die "ERROR: Unable to find gcloud at path ${GCLOUD_BIN}"
+fi
+
+GSUTIL_BIN="/var/gcloud/google-cloud-sdk/bin/gsutil"
+if [[ ! -f "${GSUTIL_BIN}" ]]; then
+  die "ERROR: Unable to find gsutil at path ${GSUTIL_BIN}"
+fi
+
+# Check environment variable for gcloud credentials
+if [[ -z "${GOOGLE_APPLICATION_CREDENTIALS}" ]]; then
+  die "ERROR: Required gcloud environment variable "\
+"${GOOGLE_APPLICATION_CREDENTIALS} is not set."
+fi
+
+# Locate main Python file
+GCS_SMOKE_PY="${SCRIPT_DIR}/python/gcs_smoke.py"
+if [[ ! -f "${GCS_SMOKE_PY}" ]]; then
+  die "ERROR: Unable to find Python file at ${GCS_SMOKE_PY}"
+fi
+
+
+LOG_FILE="/tmp/tf-gcs-test.log"
+rm -rf ${LOG_FILE} || \
+    die "ERROR: Failed to remove existing log file ${LOG_FILE}"
+
+# Invoke main Python file, capturing both stdout and stderr in the log file
+python "${GCS_SMOKE_PY}" --gcs_bucket_url="${GCS_BUCKET_URL}" \
+    > "${LOG_FILE}" 2>&1
+
+if [[ $? != "0" ]]; then
+  cat ${LOG_FILE}
+  die "FAIL: End-to-end test of GCS access from TensorFlow failed."
+fi
+
+cat ${LOG_FILE}
+echo ""
+
+
+# Clean up the newly created tfrecord file in GCS bucket.
+# First, activate gcloud service account
+"${GCLOUD_BIN}" auth activate-service-account \
+    --key-file "${GOOGLE_APPLICATION_CREDENTIALS}" || \
+    die "ERROR: Failed to activate gcloud service account with JSON key file"
+
+NEW_TFREC_URL=$(grep "Using input path" "${LOG_FILE}" | \
+                awk '{print $NF}')
+if [[ -z ${NEW_TFREC_URL} ]]; then
+  die "FAIL: Unable to determine the URL to the new tfrecord file in GCS"
+fi
+"${GSUTIL_BIN}" rm "${NEW_TFREC_URL}" && \
+    echo "Cleaned up new tfrecord file in GCS: ${NEW_TFREC_URL}" || \
+    die "FAIL: Unable to clean up new tfrecord file in GCS: ${NEW_TFREC_URL}"
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@ -8,8 +8,8 @@ load("//tensorflow:tensorflow.bzl", "transitive_hdrs")
 transitive_hdrs(
    name = "other_headers",
    deps = [
-        "//third_party/eigen3",
        "//tensorflow/core:protos_all_cc",
+        "//third_party/eigen3",
    ],
 )

--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@ -108,21 +108,16 @@ class InstallHeaders(Command):
    # directories for -I
    install_dir = re.sub('/google/protobuf/src', '', install_dir)

-    # Copy eigen code into tensorflow/include,
-    # tensorflow/include/external/eigen_archive/eigen-eigen-<revision>,
-    # and tensorflow/include/eigen-eigen-<revision>.
+    # Copy eigen code into tensorflow/include.
    # A symlink would do, but the wheel file that gets created ignores
    # symlink within the directory hierarchy.
    # NOTE(keveman): Figure out how to customize bdist_wheel package so
    # we can do the symlink.
-    if re.search(r'(external/eigen_archive/eigen-eigen-\w+)', install_dir):
-      extra_dirs = [re.sub('/external/eigen_archive', '', install_dir),
-                    re.sub(r'external/eigen_archive/eigen-eigen-\w+', '',
-                           install_dir)]
-      for extra_dir in extra_dirs:
-        if not os.path.exists(extra_dir):
-          self.mkpath(extra_dir)
-        self.copy_file(header, extra_dir)
+    if 'external/eigen_archive/' in install_dir:
+      extra_dir = install_dir.replace('external/eigen_archive', '')
+      if not os.path.exists(extra_dir):
+        self.mkpath(extra_dir)
+      self.copy_file(header, extra_dir)

    if not os.path.exists(install_dir):
      self.mkpath(install_dir)
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@ -4,10 +4,17 @@
 # within the workspace (e.g. "tensorflow/"), and tf_repo_name is the name of the
 # local_repository rule (e.g. "@tf").
 def tf_workspace(path_prefix = "", tf_repo_name = ""):
+
+  # These lines need to be changed when updating Eigen. They are parsed from
+  # this file by the cmake and make builds to determine the eigen version and hash.
+  eigen_version = "b4fa9622b809"
+  eigen_sha256 = "2862840c2de9c0473a4ef20f8678949ae89ab25965352ee53329e63ba46cec62"
+
  native.new_http_archive(
    name = "eigen_archive",
-    url = "https://bitbucket.org/eigen/eigen/get/b4fa9622b809.tar.gz",
-    sha256 = "2862840c2de9c0473a4ef20f8678949ae89ab25965352ee53329e63ba46cec62",
+    url = "https://bitbucket.org/eigen/eigen/get/" + eigen_version + ".tar.gz",
+    sha256 = eigen_sha256,
+    strip_prefix = "eigen-eigen-" + eigen_version,
    build_file = path_prefix + "eigen.BUILD",
  )

@ -56,6 +63,13 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
    build_file = path_prefix + "png.BUILD",
  )

+  native.new_http_archive(
+    name = "gif_archive",
+    url = "http://ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
+    sha256 = "34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1",
+    build_file = path_prefix + "gif.BUILD",
+  )
+
  native.new_http_archive(
    name = "six_archive",
    url = "https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz#md5=34eed507548117b2ab523ab14b2f8b55",
@ -92,8 +106,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
  )

  native.bind(
-      name = "python_headers",
-      actual = tf_repo_name + "//util/python:python_headers",
+    name = "python_headers",
+    actual = tf_repo_name + "//util/python:python_headers",
  )

  # grpc expects //external:protobuf_clib and //external:protobuf_compiler
@ -141,9 +155,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
  )

  native.git_repository(
-      name = "boringssl_git",
-      remote = "https://github.com/google/boringssl.git",
-      commit = "bbcaa15b0647816b9a1a9b9e0d209cd6712f0105",  # 2016-07-11
+    name = "boringssl_git",
+    remote = "https://github.com/google/boringssl.git",
+    commit = "bbcaa15b0647816b9a1a9b9e0d209cd6712f0105",  # 2016-07-11
  )

  native.new_git_repository(
--- a/third_party/avro/BUILD
+++ b/third_party/avro/BUILD
@ -1,4 +1,3 @@
 package(default_visibility = ["//visibility:public"])

 licenses(["notice"])  # Apache 2.0
-
--- a/third_party/eigen3/BUILD
+++ b/third_party/eigen3/BUILD
@ -13,7 +13,6 @@ cc_library(
        "unsupported/Eigen/CXX11/FixedPoint",
        "unsupported/Eigen/CXX11/src/FixedPoint/*.h",
    ]),
-    includes = ["."],
    visibility = ["//visibility:public"],
    deps = [
        "@eigen_archive//:eigen",
--- a/third_party/eigen3/Eigen/Cholesky
+++ b/third_party/eigen3/Eigen/Cholesky
@ -1 +1 @@
-#include "eigen-eigen-b4fa9622b809/Eigen/Cholesky"
+#include "Eigen/Cholesky"
--- a/third_party/eigen3/Eigen/Core
+++ b/third_party/eigen3/Eigen/Core
@ -1 +1 @@
-#include "eigen-eigen-b4fa9622b809/Eigen/Core"
+#include "Eigen/Core"
--- a/third_party/eigen3/Eigen/Eigenvalues
+++ b/third_party/eigen3/Eigen/Eigenvalues
@ -1 +1 @@
-#include "eigen-eigen-b4fa9622b809/Eigen/Eigenvalues"
+#include "Eigen/Eigenvalues"
--- a/third_party/eigen3/Eigen/LU
+++ b/third_party/eigen3/Eigen/LU
@ -1 +1 @@
-#include "eigen-eigen-b4fa9622b809/Eigen/LU"
+#include "Eigen/LU"
--- a/third_party/eigen3/Eigen/QR
+++ b/third_party/eigen3/Eigen/QR
@ -1 +1 @@
-#include "eigen-eigen-b4fa9622b809/Eigen/QR"
+#include "Eigen/QR"
--- a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
@ -1 +1 @@
-#include "eigen-eigen-b4fa9622b809/unsupported/Eigen/CXX11/Tensor"
+#include "unsupported/Eigen/CXX11/Tensor"
--- a/Show More
+++ b/Show More