Merge changes from GitHub.

Additionally:
- change single quotes to double quotes to make path rewriting easier
- guard Windows lib reference with PLATFORM_WINDOWS
- fix failing kmeans test
Change: 141515942
Andrew Harp 2016-12-08 20:05:49 -08:00 committed by TensorFlower Gardener
parent 90b72f4b2f
commit 1cb96893a6
256 changed files with 6163 additions and 2346 deletions

View File

@ -3,9 +3,9 @@
</div>
-----------------
| **`Linux CPU`** | **`Linux GPU PIP`** | **`Mac OS CPU`** | **`Android`** |
|-------------------|----------------------|------------------|----------------|
| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-gpu_pip)](https://ci.tensorflow.org/job/tensorflow-master-gpu_pip) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) |
| **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
|-----------------|---------------------|------------------|-------------------|---------------|
| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) |
**TensorFlow** is an open source software library for numerical computation using
data flow graphs. Nodes in the graph represent mathematical operations, while
@ -33,10 +33,10 @@ and discussion.**
People who are a little more adventurous can also try our nightly binaries:
* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.11.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.11.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.11.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.11.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.11.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.12.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.12.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.12.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.12.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.12.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.12.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.12.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.12.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.12.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-0.12.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
* [Android](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/lastSuccessfulBuild/artifact/bazel-out/local_linux/bin/tensorflow/examples/android/tensorflow_demo.apk) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/))
#### *Try your first TensorFlow program*

View File

@ -6,9 +6,49 @@
semantics. tf.div is renamed to tf.division. New operators tf.truncatediv and
tf.truncatemod are available for achieving the previous C++ (truncation)
division/modulus semantics.
# Release 0.12.0
## Major Features and Improvements
* TensorFlow now builds and runs on Microsoft Windows (tested on Windows 10,
Windows 7, and Windows Server 2016). Supported languages include Python (via a
pip package) and C++. CUDA 8.0 and cuDNN 5.1 are supported for GPU
acceleration. Known limitations include: It is not currently possible to load
a custom op library. The GCS and HDFS file systems are not currently
supported. The following ops are not currently implemented:
DepthwiseConv2dNative, DepthwiseConv2dNativeBackpropFilter,
DepthwiseConv2dNativeBackpropInput, Dequantize, Digamma, Erf, Erfc, Igamma,
Igammac, Lgamma, Polygamma, QuantizeAndDequantize, QuantizedAvgPool,
QuantizedBatchNormWithGlobalNormalization, QuantizedBiasAdd, QuantizedConcat,
QuantizedConv2D, QuantizedMatmul, QuantizedMaxPool,
QuantizeDownAndShrinkRange, QuantizedRelu, QuantizedRelu6, QuantizedReshape,
QuantizeV2, RequantizationRange, and Requantize.
* Go: Experimental API in Go to create and execute graphs
(https://godoc.org/github.com/tensorflow/tensorflow/tensorflow/go)
* New checkpoint format becomes the default in `tf.train.Saver`. Old V1
checkpoints continue to be readable. The format is controlled by the
`write_version` argument, and `tf.train.Saver` now writes the new V2 format by
default, which significantly reduces the peak memory required and the latency
incurred during restore (a short sketch follows this list).
* Added a new library of matrix-free (iterative) solvers for linear equations,
linear least-squares, eigenvalues and singular values in
tensorflow/contrib/solvers. The initial version has Lanczos bidiagonalization,
conjugate gradients and CGLS.
* Added gradients for `matrix_solve_ls` and `self_adjoint_eig`.
* Large cleanup to add second order gradient for ops with C++ gradients and
improve existing gradients such that most ops can now be differentiated
multiple times.
* Added a solver for ordinary differential equations,
`tf.contrib.integrate.odeint` (sketched after this list).
* New contrib module for tensors with named axes, `tf.contrib.labeled_tensor`.
* Visualization of embeddings in TensorBoard.
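To make the checkpoint-format change above concrete, here is a minimal sketch against the 0.12-era Python API (not part of this commit). It builds two savers: one using the new V2 default and one pinned to the old V1 format via `write_version`. The variable, session, and `/tmp` paths are made up for illustration, and the exact enum spelling `tf.train.SaverDef.V1` is assumed.

```python
import tensorflow as tf

v = tf.Variable(tf.zeros([10]), name="v")
init = tf.global_variables_initializer()

# Default in 0.12: checkpoints are written in the new V2 format.
saver_v2 = tf.train.Saver()
# Assumed opt-out: keep writing the old V1 format via write_version.
saver_v1 = tf.train.Saver(write_version=tf.train.SaverDef.V1)

with tf.Session() as sess:
    sess.run(init)
    saver_v2.save(sess, "/tmp/model_v2.ckpt")  # hypothetical path
    saver_v1.save(sess, "/tmp/model_v1.ckpt")  # hypothetical path
```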
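Similarly, a small sketch of the new ODE solver mentioned above, assuming `tf.contrib.integrate.odeint(func, y0, t)` takes a derivative function `func(y, t)`, an initial value, and a vector of time points. The toy equation dy/dt = -y is illustrative only.

```python
import numpy as np
import tensorflow as tf

def func(y, t):
    # dy/dt = -y, whose exact solution is y(t) = exp(-t).
    return -y

t = np.linspace(0.0, 1.0, 11)
y = tf.contrib.integrate.odeint(func, 1.0, t)

with tf.Session() as sess:
    print(sess.run(y))  # should approximate exp(-t) at the sample points
```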
## Breaking Changes to the API
* `BusAdjacency` enum replaced with a protocol buffer `DeviceLocality`. PCI bus
indexing now starts from 1 instead of 0, and bus_id==0 is used where previously
BUS_ANY was used.
indexing now starts from 1 instead of 0, and bus_id==0 is used where
previously BUS_ANY was used.
* `Env::FileExists` and `FileSystem::FileExists` now return a tensorflow::Status
instead of a bool. Any call to this function can be converted to a bool
by adding .ok() to the call.
@ -16,8 +56,9 @@ BUS_ANY was used.
indicating its preferred use in language bindings for TensorFlow.
What was previously `TF_Session` has been renamed to `TF_DeprecatedSession`.
* Renamed TF_Port to TF_Output in the C API.
* Removes RegisterShape from public API. Use C++ shape function registration
instead.
* Removes RegisterShape from public API. Use C++ shape function registration instead.
indexing now starts from 1 instead of 0, and `bus_id==0` is used where
previously `BUS_ANY` was used.
* Most RNN cells and RNN functions now use different variable scopes to be
consistent with layers (`tf.contrib.layers`). This means old checkpoints
written using this code will not load after this change without providing
@ -27,6 +68,65 @@ BUS_ANY was used.
* Deprecated tf.select op. tf.where should be used instead.
* `SparseTensor.shape` has been renamed to `SparseTensor.dense_shape`. Same for
`SparseTensorValue.shape`.
* `Env::FileExists` and `FileSystem::FileExists` now return a
`tensorflow::Status` instead of a bool. Any call to this function can be
converted to a bool by adding `.ok()` to the call.
* C API: Type `TF_SessionWithGraph` has been renamed to `TF_Session`, indicating
its preferred use in language bindings for TensorFlow. What was previously
`TF_Session` has been renamed to `TF_DeprecatedSession`.
* C API: Renamed `TF_Port` to `TF_Output`.
* C API: The caller retains ownership of `TF_Tensor` objects provided to
`TF_Run`, `TF_SessionRun`, `TF_SetAttrTensor` etc.
* Renamed `tf.image.per_image_whitening()` to
`tf.image.per_image_standardization()`
* Move Summary protobuf constructors to `tf.summary` submodule.
* Deprecate `histogram_summary`, `audio_summary`, `scalar_summary`,
`image_summary`, `merge_summary`, and `merge_all_summaries`.
* Combined `batch_*` and regular version of linear algebra and FFT ops. The
regular op now handles batches as well. All `batch_*` Python interfaces were
removed.
* `tf.all_variables`, `tf.VARIABLES` and `tf.initialize_all_variables` renamed
to `tf.global_variables`, `tf.GLOBAL_VARIABLES` and
`tf.global_variables_initializer` respectively (see the sketch below).
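A short sketch of the renames in this list (including the `tf.select` deprecation noted above), assuming the 0.12-era Python API; the variable and values are made up for illustration.

```python
import tensorflow as tf

x = tf.Variable([1.0, -2.0, 3.0], name="x")

init = tf.global_variables_initializer()      # was tf.initialize_all_variables()
all_vars = tf.global_variables()              # was tf.all_variables()
y = tf.where(x > 0.0, x, tf.zeros_like(x))    # was tf.select(cond, x, y)

with tf.Session() as sess:
    sess.run(init)
    print(sess.run(y), [v.op.name for v in all_vars])
```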
## Bug Fixes and Other Changes
* Use threadsafe version of `lgamma` function.
* Fix `tf.sqrt` handling of negative arguments.
* Fixed bug causing incorrect number of threads to be used for multi-threaded
benchmarks.
* Performance optimizations for `batch_matmul` on multi-core CPUs.
* Improve trace, `matrix_set_diag`, `matrix_diag_part` and their gradients to
work for rectangular matrices.
* Support for SVD of complex valued matrices.
## Thanks to our Contributors
This release contains contributions from many people at Google, as well as:
@a7744hsc, Abhi Agg, @admcrae, Adriano Carmezim, Aki Sukegawa, Alex Kendall,
Alexander Rosenberg Johansen, @amcrae, Amlan Kar, Andre Simpelo, Andreas Eberle,
Andrew Hundt, Arnaud Lenglet, @b0noI, Balachander Ramachandran, Ben Barsdell,
Ben Guidarelli, Benjamin Mularczyk, Burness Duan, @c0g, Changming Sun,
@chanis, Corey Wharton, Dan J, Daniel Trebbien, Darren Garvey, David Brailovsky,
David Jones, Di Zeng, @DjangoPeng, Dr. Kashif Rasul, @drag0, Fabrizio (Misto)
Milo, Fabrício Ceschin, @fp, @Ghedeon, @guschmue, Gökçen Eraslan, Haosdent
Huang, Haroen Viaene, Harold Cooper, Henrik Holst, @hoangmit, Ivan Ukhov, Javier
Dehesa, Jingtian Peng, Jithin Odattu, Joan Pastor, Johan Mathe, Johannes Mayer,
Jongwook Choi, Justus Schwabedal, Kai Wolf, Kamil Hryniewicz, Kamran Amini,
Karen Brems, Karl Lattimer, @kborer, Ken Shirriff, Kevin Rose, Larissa Laich,
Laurent Mazare, Leonard Lee, Liang-Chi Hsieh, Liangliang He, Luke Iwanski,
Marek Kolodziej, Moustafa Alzantot, @MrQianjinsi, @nagachika, Neil Han, Nick
Meehan, Niels Ole Salscheider, Nikhil Mishra, @nschuc, Ondrej Skopek, Ondřej
Filip, @OscarDPan, Pablo Moyano, Przemyslaw Tredak, @qitaishui, @Quarazy,
@raix852, Philipp Helo, Sam Abrahams, @SriramRamesh, Till Hoffmann, Tushar Soni,
@tvn, @tyfkda, Uwe Schmidt, Victor Villas, Vit Stepanovs, Vladislav Gubarev,
@wujingyue, Xuesong Yang, Yi Liu, Yilei Yang, @youyou3, Yuan (Terry) Tang,
Yuming Wang, Zafar Takhirov, @zhongyuk, Ziming Dong, @guotong1988
We are also grateful to all who filed issues or helped resolve them, asked and
answered questions, and were part of inspiring discussions.
# Release 0.11.0

configure
View File

@ -52,7 +52,6 @@ done
if is_windows; then
TF_NEED_GCP=0
TF_NEED_HDFS=0
TF_NEED_CUDA=0
TF_NEED_OPENCL=0
fi
@ -80,10 +79,10 @@ if [ "$TF_NEED_GCP" == "1" ]; then
fi
# Update Bazel build configuration.
perl -pi -e "s,WITH_GCP_SUPPORT = (False|True),WITH_GCP_SUPPORT = True,s" tensorflow/core/platform/default/build_config.bzl
sed -i -e "s/WITH_GCP_SUPPORT = False/WITH_GCP_SUPPORT = True/" tensorflow/core/platform/default/build_config.bzl
else
# Update Bazel build configuration.
perl -pi -e "s,WITH_GCP_SUPPORT = (False|True),WITH_GCP_SUPPORT = False,s" tensorflow/core/platform/default/build_config.bzl
sed -i -e "s/WITH_GCP_SUPPORT = True/WITH_GCP_SUPPORT = False/" tensorflow/core/platform/default/build_config.bzl
fi
while [ "$TF_NEED_HDFS" == "" ]; do
@ -102,10 +101,10 @@ done
if [ "$TF_NEED_HDFS" == "1" ]; then
# Update Bazel build configuration.
perl -pi -e "s,WITH_HDFS_SUPPORT = (False|True),WITH_HDFS_SUPPORT = True,s" tensorflow/core/platform/default/build_config.bzl
sed -i -e "s/WITH_HDFS_SUPPORT = False/WITH_HDFS_SUPPORT = True/" tensorflow/core/platform/default/build_config.bzl
else
# Update Bazel build configuration.
perl -pi -e "s,WITH_HDFS_SUPPORT = (False|True),WITH_HDFS_SUPPORT = False,s" tensorflow/core/platform/default/build_config.bzl
sed -i -e "s/WITH_HDFS_SUPPORT = True/WITH_HDFS_SUPPORT = False/" tensorflow/core/platform/default/build_config.bzl
fi
# Invoke python_config and set up symlinks to python includes
@ -131,11 +130,11 @@ done
## Set up Cuda-related environment settings
while [ "$TF_NEED_CUDA" == "" ]; do
read -p "Do you wish to build TensorFlow with GPU support? [y/N] " INPUT
read -p "Do you wish to build TensorFlow with CUDA support? [y/N] " INPUT
case $INPUT in
[Yy]* ) echo "GPU support will be enabled for TensorFlow"; TF_NEED_CUDA=1;;
[Nn]* ) echo "No GPU support will be enabled for TensorFlow"; TF_NEED_CUDA=0;;
"" ) echo "No GPU support will be enabled for TensorFlow"; TF_NEED_CUDA=0;;
[Yy]* ) echo "CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=1;;
[Nn]* ) echo "No CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=0;;
"" ) echo "No CUDA support will be enabled for TensorFlow"; TF_NEED_CUDA=0;;
* ) echo "Invalid selection: " $INPUT;;
esac
done
@ -150,14 +149,15 @@ fi
if [ "$TF_NEED_CUDA" == "1" ]; then
# Set up which gcc nvcc should use as the host compiler
while true; do
# No need to set this on Windows
while ! is_windows && true; do
fromuser=""
if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
default_gcc_host_compiler_path=$(which gcc || true)
read -p "Please specify which gcc should be used by nvcc as the host compiler. [Default is $default_gcc_host_compiler_path]: " GCC_HOST_COMPILER_PATH
fromuser="1"
if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
GCC_HOST_COMPILER_PATH=$default_gcc_host_compiler_path
GCC_HOST_COMPILER_PATH="$default_gcc_host_compiler_path"
fi
fi
if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
@ -178,16 +178,23 @@ OSNAME=`uname -s`
while true; do
# Configure the Cuda SDK version to use.
if [ -z "$TF_CUDA_VERSION" ]; then
read -p "Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to use system default]: " TF_CUDA_VERSION
read -p "Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to use system default]: " TF_CUDA_VERSION
fi
fromuser=""
if [ -z "$CUDA_TOOLKIT_PATH" ]; then
default_cuda_path=/usr/local/cuda
if is_windows; then
if [ -z "$CUDA_PATH" ]; then
default_cuda_path="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0"
else
default_cuda_path="$(cygpath -m "$CUDA_PATH")"
fi
fi
read -p "Please specify the location where CUDA $TF_CUDA_VERSION toolkit is installed. Refer to README.md for more details. [Default is $default_cuda_path]: " CUDA_TOOLKIT_PATH
fromuser="1"
if [ -z "$CUDA_TOOLKIT_PATH" ]; then
CUDA_TOOLKIT_PATH=$default_cuda_path
CUDA_TOOLKIT_PATH="$default_cuda_path"
fi
fi
@ -197,7 +204,9 @@ while true; do
TF_CUDA_EXT=".$TF_CUDA_VERSION"
fi
if [ "$OSNAME" == "Linux" ]; then
if is_windows; then
CUDA_RT_LIB_PATH="lib/x64/cudart.lib"
elif [ "$OSNAME" == "Linux" ]; then
CUDA_RT_LIB_PATH="lib64/libcudart.so${TF_CUDA_EXT}"
elif [ "$OSNAME" == "Darwin" ]; then
CUDA_RT_LIB_PATH="lib/libcudart${TF_CUDA_EXT}.dylib"
@ -235,14 +244,17 @@ while true; do
fi
# Result returned from "read" will be used unexpanded. That makes "~" unusable.
# Going through one more level of expansion to handle that.
CUDNN_INSTALL_PATH=`${PYTHON_BIN_PATH} -c "import os; print(os.path.realpath(os.path.expanduser('${CUDNN_INSTALL_PATH}')))"`
CUDNN_INSTALL_PATH=`"${PYTHON_BIN_PATH}" -c "import os; print(os.path.realpath(os.path.expanduser('${CUDNN_INSTALL_PATH}')))"`
fi
if [[ -z "$TF_CUDNN_VERSION" ]]; then
TF_CUDNN_EXT=""
cudnn_lib_path=""
cudnn_alt_lib_path=""
if [ "$OSNAME" == "Linux" ]; then
if is_windows; then
cudnn_lib_path="${CUDNN_INSTALL_PATH}/lib/x64/cudnn.lib"
cudnn_alt_lib_path="${CUDNN_INSTALL_PATH}/lib/x64/cudnn.lib"
elif [ "$OSNAME" == "Linux" ]; then
cudnn_lib_path="${CUDNN_INSTALL_PATH}/lib64/libcudnn.so"
cudnn_alt_lib_path="${CUDNN_INSTALL_PATH}/libcudnn.so"
elif [ "$OSNAME" == "Darwin" ]; then
@ -255,9 +267,9 @@ while true; do
# If the path is not a symlink, readlink will exit with an error code, so
# in that case, we return the path itself.
if [ -f "$cudnn_lib_path" ]; then
REALVAL=`readlink ${cudnn_lib_path} || echo "${cudnn_lib_path}"`
REALVAL=`readlink "${cudnn_lib_path}" || echo "${cudnn_lib_path}"`
else
REALVAL=`readlink ${cudnn_alt_lib_path} || echo "${cudnn_alt_lib_path}"`
REALVAL=`readlink "${cudnn_alt_lib_path}" || echo "${cudnn_alt_lib_path}"`
fi
# Extract the version of the SONAME, if it was indeed symlinked to
@ -275,7 +287,10 @@ while true; do
TF_CUDNN_EXT=".$TF_CUDNN_VERSION"
fi
if [ "$OSNAME" == "Linux" ]; then
if is_windows; then
CUDA_DNN_LIB_PATH="lib/x64/cudnn.lib"
CUDA_DNN_LIB_ALT_PATH="lib/x64/cudnn.lib"
elif [ "$OSNAME" == "Linux" ]; then
CUDA_DNN_LIB_PATH="lib64/libcudnn.so${TF_CUDNN_EXT}"
CUDA_DNN_LIB_ALT_PATH="libcudnn.so${TF_CUDNN_EXT}"
elif [ "$OSNAME" == "Darwin" ]; then
@ -350,6 +365,16 @@ EOF
TF_CUDA_COMPUTE_CAPABILITIES=""
done
if is_windows; then
# The following three variables are needed for MSVC toolchain configuration in Bazel
export CUDA_PATH="$CUDA_TOOLKIT_PATH"
export CUDA_COMPUTE_CAPABILITIES="$TF_CUDA_COMPUTE_CAPABILITIES"
export NO_WHOLE_ARCHIVE_OPTION=1
# Set GCC_HOST_COMPILER_PATH to keep cuda_configure.bzl happy
export GCC_HOST_COMPILER_PATH="/usr/bin/dummy_compiler"
fi
# end of if "$TF_NEED_CUDA" == "1"
fi

View File

@ -1561,6 +1561,7 @@ cc_library(
"include/grpc++/impl/codegen/create_auth_context.h",
"include/grpc++/impl/codegen/grpc_library.h",
"include/grpc++/impl/codegen/method_handler_impl.h",
"include/grpc++/impl/codegen/proto_utils.h",
"include/grpc++/impl/codegen/rpc_method.h",
"include/grpc++/impl/codegen/rpc_service_method.h",
"include/grpc++/impl/codegen/security/auth_context.h",
@ -1766,10 +1767,12 @@ cc_library(
"include/grpc++/impl/codegen/completion_queue.h",
"include/grpc++/impl/codegen/completion_queue_tag.h",
"include/grpc++/impl/codegen/config.h",
"include/grpc++/impl/codegen/config_protobuf.h",
"include/grpc++/impl/codegen/core_codegen_interface.h",
"include/grpc++/impl/codegen/create_auth_context.h",
"include/grpc++/impl/codegen/grpc_library.h",
"include/grpc++/impl/codegen/method_handler_impl.h",
"include/grpc++/impl/codegen/proto_utils.h",
"include/grpc++/impl/codegen/rpc_method.h",
"include/grpc++/impl/codegen/rpc_service_method.h",
"include/grpc++/impl/codegen/security/auth_context.h",

View File

@ -63,7 +63,8 @@ TEST_F(NNGradTest, ReluGrad) {
auto y = Relu(scope_, x);
// Avoid input values where ReLU gradient is not well defined (around zero).
Tensor x_init_value = test::AsTensor<float>(
{-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9}, {5, 2});
{-0.9f, -0.7f, -0.5f, -0.3f, -0.1f, 0.1f, 0.3f, 0.5f, 0.7f, 0.9f},
{5, 2});
RunTest(x, x_init_value, y, shape);
}
@ -74,7 +75,8 @@ TEST_F(NNGradTest, Relu6Grad) {
// Avoid input values where ReLU gradient is not well defined (around zero
// and six).
Tensor x_init_value = test::AsTensor<float>(
{-0.9, -0.7, -0.5, -0.3, -0.1, 6.1, 6.3, 6.5, 6.7, 6.9}, {5, 2});
{-0.9f, -0.7f, -0.5f, -0.3f, -0.1f, 6.1f, 6.3f, 6.5f, 6.7f, 6.9f},
{5, 2});
RunTest(x, x_init_value, y, shape);
}
@ -83,7 +85,8 @@ TEST_F(NNGradTest, EluGrad) {
auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
auto y = Elu(scope_, x);
Tensor x_init_value = test::AsTensor<float>(
{-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9}, {5, 2});
{-0.9f, -0.7f, -0.5f, -0.3f, -0.1f, 0.1f, 0.3f, 0.5f, 0.7f, 0.9f},
{5, 2});
RunTest(x, x_init_value, y, shape);
}

View File

@ -229,3 +229,8 @@ class ObservedStochasticTensorTest(tf.test.TestCase):
distributions.Normal(mu=mu, sigma=sigma),
value=tf.zeros(
(1, 2), dtype=tf.int32))
if __name__ == "__main__":
tf.test.main()

View File

@ -42,8 +42,8 @@ class StochasticVariablesTest(tf.test.TestCase):
self.assertEqual(
{"stochastic_variables/sv_mu", "stochastic_variables/sv_sigma"},
set([v.op.name for v in tf.all_variables()]))
self.assertEqual(set(tf.trainable_variables()), set(tf.all_variables()))
set([v.op.name for v in tf.global_variables()]))
self.assertEqual(set(tf.trainable_variables()), set(tf.global_variables()))
v = tf.convert_to_tensor(v)
self.assertEqual(list(shape), v.get_shape().as_list())
@ -64,7 +64,7 @@ class StochasticVariablesTest(tf.test.TestCase):
})):
v = tf.get_variable("sv")
for var in tf.all_variables():
for var in tf.global_variables():
if "mu" in var.name:
mu_var = var
if "sigma" in var.name:
@ -96,7 +96,7 @@ class StochasticVariablesTest(tf.test.TestCase):
})):
v = tf.get_variable("sv", shape)
for var in tf.all_variables():
for var in tf.global_variables():
if "mu" in var.name:
mu_var = var
if "sigma" in var.name:

View File

@ -1,5 +1,5 @@
# Minimum CMake required
cmake_minimum_required(VERSION 3.1)
cmake_minimum_required(VERSION 3.5)
# Project
project(tensorflow C CXX)
@ -28,9 +28,11 @@ option(tensorflow_BUILD_CONTRIB_KERNELS "Build OpKernels from tensorflow/contrib
option(tensorflow_BUILD_CC_TESTS "Build cc unit tests " OFF)
option(tensorflow_BUILD_PYTHON_TESTS "Build python unit tests " OFF)
#Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option for
# targets that link ${CMAKE_THREAD_LIBS_INIT}.
find_package (Threads)
if (NOT WIN32)
# Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option
# for targets that link ${CMAKE_THREAD_LIBS_INIT}.
find_package (Threads)
endif()
# [CLEANUP] Remove when done
# For debugging
@ -55,6 +57,7 @@ if(WIN32)
add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC -D__VERSION__=\"MSVC\")
add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH -D_ITERATOR_DEBUG_LEVEL=0)
add_definitions(-DNDEBUG /O2) # Equivalent of -c opt in Bazel.
add_definitions(/bigobj /nologo /EHsc /GF /FC /MP /Gm-)
# Suppress warnings to reduce build log size.
add_definitions(/wd4267 /wd4244 /wd4800 /wd4503 /wd4554 /wd4996 /wd4348 /wd4018)
@ -147,11 +150,11 @@ if (tensorflow_ENABLE_GPU)
# by default we assume compute capability 3.5 and 5.2. If you change this, change it in
# CUDA_NVCC_FLAGS and cuda_config.h below
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=\"sm_30,compute_30\";-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
include_directories(${CUDA_INCLUDE})
add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.5,5.2)
add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2)
# add cudnn
include_directories(${CUDNN_HOME})
@ -161,7 +164,7 @@ if (tensorflow_ENABLE_GPU)
FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
"#ifndef CUDA_CUDA_CONFIG_H_\n"
"#define CUDA_CUDA_CONFIG_H_\n"
"#define TF_CUDA_CAPABILITIES CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
"#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
"#define TF_CUDA_VERSION \"64_80\"\n"
"#define TF_CUDNN_VERSION \"64_5\"\n"
"#endif // CUDA_CUDA_CONFIG_H_\n"
@ -207,6 +210,7 @@ if(tensorflow_BUILD_CC_EXAMPLE)
include(tf_label_image_example.cmake)
endif()
if(tensorflow_BUILD_PYTHON_BINDINGS)
include(tensorboard)
include(tf_python.cmake)
endif()
if (tensorflow_BUILD_CC_TESTS OR tensorflow_BUILD_PYTHON_TESTS)

View File

@ -21,7 +21,7 @@ Note: Windows support is in an **alpha** state, and we welcome your feedback.
### Pre-requisites
* CMake version 3.1 up to 3.6
* CMake version 3.5 up to 3.6
* [Git](http://git-scm.com)

View File

@ -0,0 +1,134 @@
include (ExternalProject)
set(tensorboard_dependencies)
add_custom_target(tensorboard_copy_dependencies)
function(tb_new_http_archive)
cmake_parse_arguments(_TB "" "NAME;URL" "FILES" ${ARGN})
ExternalProject_Add(${_TB_NAME}
PREFIX ${_TB_NAME}
URL ${_TB_URL}
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}/${_TB_NAME}"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
)
set(src_dir "${CMAKE_CURRENT_BINARY_DIR}/${_TB_NAME}/src/${_TB_NAME}")
set(dst_dir "${CMAKE_CURRENT_BINARY_DIR}/tensorboard_external/${_TB_NAME}")
foreach(src_file ${_TB_FILES})
add_custom_command(
TARGET tensorboard_copy_dependencies PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${src_dir}/${src_file} ${dst_dir}/${src_file}
)
endforeach()
set(tensorboard_dependencies ${tensorboard_dependencies} ${_TB_NAME} PARENT_SCOPE)
endfunction()
function(tb_http_file)
cmake_parse_arguments(_TB "" "NAME;URL" "" ${ARGN})
get_filename_component(src_file ${_TB_URL} NAME)
file(DOWNLOAD ${_TB_URL} "${DOWNLOAD_LOCATION}/${_TB_NAME}/${src_file}")
set(src_dir "${DOWNLOAD_LOCATION}/${_TB_NAME}")
set(dst_dir "${CMAKE_CURRENT_BINARY_DIR}/tensorboard_external/${_TB_NAME}/file")
add_custom_command(
TARGET tensorboard_copy_dependencies PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${src_dir}/${src_file} ${dst_dir}/${src_file}
)
add_custom_target(${_TB_NAME} DEPENDS ${src_dir}/${src_file})
set(tensorboard_dependencies ${tensorboard_dependencies} ${_TB_NAME} PARENT_SCOPE)
endfunction()
# Parse TensorBoard dependency names and URLs from Bazel's WORKSPACE file.
set(tb_dep_names)
file(STRINGS ${PROJECT_SOURCE_DIR}/../../../WORKSPACE workspace_contents)
foreach(line ${workspace_contents})
if(line MATCHES "# TENSORBOARD_BOWER_AUTOGENERATED_BELOW_THIS_LINE_DO_NOT_EDIT")
set(tb_deps_started 1)
endif()
if(NOT tb_deps_started)
continue()
endif()
if(line MATCHES "new_http_archive\\(")
set(tb_dep_is_archive 1)
continue()
elseif(line MATCHES "http_file\\(")
set(tb_dep_is_archive 0)
continue()
endif()
string(REGEX MATCH "name.*=.*\"(.*)\"" has_name ${line})
if(has_name)
set(tb_dep_name ${CMAKE_MATCH_1})
continue()
endif()
string(REGEX MATCH "url.*=.*\"(.*)\"" has_url ${line})
if(has_url)
list(APPEND tb_dep_names ${tb_dep_name})
set(${tb_dep_name}_is_archive ${tb_dep_is_archive})
set(${tb_dep_name}_url ${CMAKE_MATCH_1})
endif()
endforeach()
# Parse the files needed for each TensorBoard dependency from Bazel's bower.BUILD file.
# Due to CMAKE quirkiness, cannot use file(strings) with files that contain '[' and ']'.
file(READ ${PROJECT_SOURCE_DIR}/../../../bower.BUILD bower_build_contents)
string(REPLACE "\[" "OB" bower_build_contents "${bower_build_contents}")
string(REPLACE "\]" "CB" bower_build_contents "${bower_build_contents}")
string(REPLACE ";" "\\\\;" bower_build_contents "${bower_build_contents}")
string(REPLACE "\n" "E;" bower_build_contents "${bower_build_contents}")
foreach(line ${bower_build_contents})
string(REGEX MATCH "name.*=.*\"(.*)\"" has_name ${line})
if(has_name)
set(tb_dep_name ${CMAKE_MATCH_1})
set(${tb_dep_name}_files)
continue()
endif()
string(REGEX MATCH "srcs.*=.*\"(.*)\"CB" has_single_line_src ${line})
if(has_single_line_src)
list(APPEND ${tb_dep_name}_files ${CMAKE_MATCH_1})
continue()
endif()
if(line MATCHES "srcs.*=.*OB")
set(inside_files_def 1)
continue()
elseif(line MATCHES "CB,")
set(inside_files_def 0)
continue()
endif()
if(inside_files_def)
string(REGEX MATCH "\"(.*)\"," has_file ${line})
if(has_file)
list(APPEND ${tb_dep_name}_files ${CMAKE_MATCH_1})
endif()
endif()
endforeach()
# Generate a target for each dependency.
foreach(tb_dep_name ${tb_dep_names})
if (${tb_dep_name}_is_archive)
tb_new_http_archive(
NAME ${tb_dep_name}
URL ${${tb_dep_name}_url}
FILES ${${tb_dep_name}_files}
)
else()
tb_http_file(
NAME ${tb_dep_name}
URL ${${tb_dep_name}_url}
)
endif()
endforeach()
add_dependencies(tensorboard_copy_dependencies ${tensorboard_dependencies})

View File

@ -26,7 +26,7 @@ from setuptools import find_packages, setup, Command
from setuptools.command.install import install as InstallCommandBase
from setuptools.dist import Distribution
_VERSION = '0.11.0-cmake-experimental'
_VERSION = '0.12.0-rc0-cmake-experimental'
REQUIRED_PACKAGES = [
'numpy >= 1.11.0',

View File

@ -84,3 +84,36 @@ add_library(tf_cc_ops OBJECT
"${tensorflow_source_dir}/tensorflow/cc/ops/const_op.cc"
"${tensorflow_source_dir}/tensorflow/cc/ops/standard_ops.h"
)
########################################################
# tf_cc library
########################################################
file(GLOB_RECURSE tf_cc_srcs
"${tensorflow_source_dir}/tensorflow/cc/client/*.h"
"${tensorflow_source_dir}/tensorflow/cc/client/*.cc"
"${tensorflow_source_dir}/tensorflow/cc/gradients/*.h"
"${tensorflow_source_dir}/tensorflow/cc/gradients/*.cc"
"${tensorflow_source_dir}/tensorflow/cc/saved_model/*.h"
"${tensorflow_source_dir}/tensorflow/cc/saved_model/*.cc"
"${tensorflow_source_dir}/tensorflow/cc/training/*.h"
"${tensorflow_source_dir}/tensorflow/cc/training/*.cc"
)
set(tf_cc_srcs
${tf_cc_srcs}
"${tensorflow_source_dir}/tensorflow/cc/framework/grad_op_registry.h"
"${tensorflow_source_dir}/tensorflow/cc/framework/grad_op_registry.cc"
"${tensorflow_source_dir}/tensorflow/cc/framework/gradient_checker.h"
"${tensorflow_source_dir}/tensorflow/cc/framework/gradient_checker.cc"
"${tensorflow_source_dir}/tensorflow/cc/framework/gradients.h"
"${tensorflow_source_dir}/tensorflow/cc/framework/gradients.cc"
)
file(GLOB_RECURSE tf_cc_test_srcs
"${tensorflow_source_dir}/tensorflow/cc/*test*.cc"
)
list(REMOVE_ITEM tf_cc_srcs ${tf_cc_test_srcs})
add_library(tf_cc OBJECT ${tf_cc_srcs})
add_dependencies(tf_cc tf_cc_framework tf_cc_ops)

View File

@ -210,11 +210,7 @@ file(GLOB_RECURSE tf_core_framework_test_srcs
"${tensorflow_source_dir}/tensorflow/core/util/*main.cc"
)
list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_test_srcs}
"${tensorflow_source_dir}/tensorflow/core/util/memmapped_file_system.cc"
"${tensorflow_source_dir}/tensorflow/core/util/memmapped_file_system.h"
"${tensorflow_source_dir}/tensorflow/core/util/memmapped_file_system_writer.cc"
)
list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_test_srcs})
add_library(tf_core_framework OBJECT
${tf_core_framework_srcs}

View File

@ -90,8 +90,6 @@ list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_exclude_srcs})
if(WIN32)
file(GLOB_RECURSE tf_core_kernels_windows_exclude_srcs
# not working on windows yet
"${tensorflow_source_dir}/tensorflow/core/kernels/depthwise_conv_op.cc" # Cannot find symbol: tensorflow::LaunchConv2DOp<struct Eigen::ThreadPoolDevice, double>::launch(...).
"${tensorflow_source_dir}/tensorflow/core/kernels/fact_op.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*"
"${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
"${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"

View File

@ -9,6 +9,7 @@ set(tf_op_lib_names
"io_ops"
"linalg_ops"
"logging_ops"
"losses"
"math_ops"
"nn_ops"
"no_op"
@ -17,6 +18,7 @@ set(tf_op_lib_names
"resource_variable_ops"
"script_ops"
"sdca_ops"
"set_ops"
"sendrecv_ops"
"sparse_ops"
"state_ops"

View File

@ -68,7 +68,7 @@ function(RELATIVE_PROTOBUF_GENERATE_PYTHON ROOT_DIR SRCS)
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/tf_python/${REL_DIR}/${FIL_WE}_pb2.py"
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
ARGS --python_out ${CMAKE_CURRENT_BINARY_DIR}/tf_python/ -I ${ROOT_DIR} -I ${PROTOBUF_INCLUDE_DIRS} ${ABS_FIL}
ARGS --python_out ${CMAKE_CURRENT_BINARY_DIR}/tf_python/ -I ${ROOT_DIR} -I ${PROTOBUF_INCLUDE_DIRS} ${ABS_FIL}
DEPENDS ${PROTOBUF_PROTOC_EXECUTABLE} protobuf
COMMENT "Running Python protocol buffer compiler on ${FIL}"
VERBATIM )
@ -118,12 +118,20 @@ RELATIVE_PROTOBUF_GENERATE_PYTHON(
${tensorflow_source_dir} PYTHON_PROTO_GENFILES ${tf_protos_python_srcs}
)
# NOTE(mrry): Avoid regenerating the tensorflow/core protos because this
# can cause benign-but-failing-on-Windows-due-to-file-locking conflicts
# when two rules attempt to generate the same file.
file(GLOB_RECURSE tf_python_protos_cc_srcs RELATIVE ${tensorflow_source_dir}
"${tensorflow_source_dir}/tensorflow/python/*.proto"
"${tensorflow_source_dir}/tensorflow/contrib/session_bundle/*.proto"
"${tensorflow_source_dir}/tensorflow/contrib/tensorboard/*.proto"
)
RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
${tensorflow_source_dir} ${tf_protos_python_srcs}
${tensorflow_source_dir} ${tf_python_protos_cc_srcs}
)
add_library(tf_python_protos_cc ${PROTO_SRCS} ${PROTO_HDRS})
add_dependencies(tf_python_protos_cc tf_protos_cc)
# tf_python_touchup_modules adds empty __init__.py files to all
# directories containing Python code, so that Python will recognize
@ -141,7 +149,7 @@ function(add_python_module MODULE_NAME)
set(options DONTCOPY)
cmake_parse_arguments(ADD_PYTHON_MODULE "${options}" "" "" ${ARGN})
add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/tf_python/${MODULE_NAME}")
COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/tf_python/${MODULE_NAME}")
add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E touch "${CMAKE_CURRENT_BINARY_DIR}/tf_python/${MODULE_NAME}/__init__.py")
file(GLOB module_python_srcs RELATIVE ${tensorflow_source_dir}
@ -164,6 +172,9 @@ add_python_module("tensorflow/core/lib")
add_python_module("tensorflow/core/lib/core")
add_python_module("tensorflow/core/protobuf")
add_python_module("tensorflow/core/util")
add_python_module("tensorflow/examples")
add_python_module("tensorflow/examples/tutorials")
add_python_module("tensorflow/examples/tutorials/mnist")
add_python_module("tensorflow/python")
add_python_module("tensorflow/python/client")
add_python_module("tensorflow/python/debug")
@ -172,6 +183,7 @@ add_python_module("tensorflow/python/debug/examples")
add_python_module("tensorflow/python/debug/wrappers")
add_python_module("tensorflow/python/framework")
add_python_module("tensorflow/python/kernel_tests")
add_python_module("tensorflow/python/layers")
add_python_module("tensorflow/python/lib")
add_python_module("tensorflow/python/lib/core")
add_python_module("tensorflow/python/lib/io")
@ -179,6 +191,7 @@ add_python_module("tensorflow/python/ops")
add_python_module("tensorflow/python/platform")
add_python_module("tensorflow/python/platform/default")
add_python_module("tensorflow/python/platform/summary")
add_python_module("tensorflow/python/saved_model")
add_python_module("tensorflow/python/summary")
add_python_module("tensorflow/python/summary/impl")
add_python_module("tensorflow/python/summary/writer")
@ -187,8 +200,13 @@ add_python_module("tensorflow/python/training")
add_python_module("tensorflow/python/user_ops")
add_python_module("tensorflow/python/util")
add_python_module("tensorflow/python/util/protobuf")
add_python_module("tensorflow/contrib/")
add_python_module("tensorflow/tensorboard")
add_python_module("tensorflow/tensorboard/backend")
add_python_module("tensorflow/tensorboard/lib/python")
add_python_module("tensorflow/tensorboard/plugins")
add_python_module("tensorflow/tensorboard/plugins/projector")
add_python_module("tensorflow/tensorboard/scripts")
add_python_module("tensorflow/contrib")
add_python_module("tensorflow/contrib/android")
add_python_module("tensorflow/contrib/android/java")
add_python_module("tensorflow/contrib/android/java/org")
@ -215,6 +233,7 @@ add_python_module("tensorflow/contrib/cudnn_rnn/ops")
add_python_module("tensorflow/contrib/cudnn_rnn/python")
add_python_module("tensorflow/contrib/cudnn_rnn/python/kernel_tests")
add_python_module("tensorflow/contrib/cudnn_rnn/python/ops")
add_python_module("tensorflow/contrib/deprecated")
add_python_module("tensorflow/contrib/distributions")
add_python_module("tensorflow/contrib/distributions/python")
add_python_module("tensorflow/contrib/distributions/python/kernel_tests")
@ -256,6 +275,9 @@ add_python_module("tensorflow/contrib/ios_examples/camera/en.lproj")
add_python_module("tensorflow/contrib/ios_examples/simple")
add_python_module("tensorflow/contrib/ios_examples/simple/data")
add_python_module("tensorflow/contrib/ios_examples/simple/tf_ios_makefile_example.xcodeproj")
add_python_module("tensorflow/contrib/labeled_tensor")
add_python_module("tensorflow/contrib/labeled_tensor/python")
add_python_module("tensorflow/contrib/labeled_tensor/python/ops")
add_python_module("tensorflow/contrib/layers")
add_python_module("tensorflow/contrib/layers/kernels")
add_python_module("tensorflow/contrib/layers/ops")
@ -279,6 +301,10 @@ add_python_module("tensorflow/contrib/learn/python/learn/preprocessing/tests")
add_python_module("tensorflow/contrib/learn/python/learn/tests")
add_python_module("tensorflow/contrib/learn/python/learn/tests/dataframe")
add_python_module("tensorflow/contrib/learn/python/learn/utils")
add_python_module("tensorflow/contrib/linalg")
add_python_module("tensorflow/contrib/linalg/python")
add_python_module("tensorflow/contrib/linalg/python/ops")
add_python_module("tensorflow/contrib/linalg/python/kernel_tests")
add_python_module("tensorflow/contrib/linear_optimizer")
add_python_module("tensorflow/contrib/linear_optimizer/kernels")
add_python_module("tensorflow/contrib/linear_optimizer/kernels/g3doc")
@ -329,8 +355,12 @@ add_python_module("tensorflow/contrib/slim/python")
add_python_module("tensorflow/contrib/slim/python/slim")
add_python_module("tensorflow/contrib/slim/python/slim/data")
add_python_module("tensorflow/contrib/slim/python/slim/nets")
add_python_module("tensorflow/contrib/solvers")
add_python_module("tensorflow/contrib/solvers/python")
add_python_module("tensorflow/contrib/solvers/python/ops")
add_python_module("tensorflow/contrib/specs")
add_python_module("tensorflow/contrib/specs/python")
add_python_module("tensorflow/contrib/stat_summarizer")
add_python_module("tensorflow/contrib/tensorboard")
add_python_module("tensorflow/contrib/tensorboard/plugins")
add_python_module("tensorflow/contrib/tensorboard/plugins/projector")
@ -350,10 +380,6 @@ add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/ops")
add_python_module("tensorflow/contrib/tensor_forest/python")
add_python_module("tensorflow/contrib/tensor_forest/python/kernel_tests")
add_python_module("tensorflow/contrib/tensor_forest/python/ops")
add_python_module("tensorflow/contrib/tensorboard")
add_python_module("tensorflow/contrib/tensorboard")
add_python_module("tensorflow/contrib/tensorboard/plugins")
add_python_module("tensorflow/contrib/tensorboard/plugins/projector")
add_python_module("tensorflow/contrib/testing")
add_python_module("tensorflow/contrib/testing/python")
add_python_module("tensorflow/contrib/testing/python/framework")
@ -366,6 +392,12 @@ add_python_module("tensorflow/contrib/training/python")
add_python_module("tensorflow/contrib/training/python/training")
add_python_module("tensorflow/contrib/util")
# Additional directories with no Python sources.
add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/dist")
add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/lib/css")
########################################################
# tf_python_op_gen_main library
@ -394,7 +426,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
set(oneValueArgs DESTINATION)
set(multiValueArgs ADDITIONAL_LIBRARIES)
cmake_parse_arguments(GENERATE_PYTHON_OP_LIB
"" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
"${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(NOT DEFINED GENERATE_PYTHON_OP_LIB_DESTINATION)
# Default destination is tf_python/tensorflow/python/ops/gen_<...>.py.
set(GENERATE_PYTHON_OP_LIB_DESTINATION
@ -424,7 +456,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
COMMAND ${tf_python_op_lib_name}_gen_python @${tensorflow_source_dir}/tensorflow/python/ops/hidden_ops.txt 1 > ${GENERATE_PYTHON_OP_LIB_DESTINATION}
DEPENDS ${tf_python_op_lib_name}_gen_python
)
set(tf_python_ops_generated_files ${tf_python_ops_generated_files}
${GENERATE_PYTHON_OP_LIB_DESTINATION} PARENT_SCOPE)
endfunction()
@ -441,12 +473,14 @@ GENERATE_PYTHON_OP_LIB("image_ops")
GENERATE_PYTHON_OP_LIB("io_ops")
GENERATE_PYTHON_OP_LIB("linalg_ops")
GENERATE_PYTHON_OP_LIB("logging_ops")
GENERATE_PYTHON_OP_LIB("losses")
GENERATE_PYTHON_OP_LIB("nn_ops")
GENERATE_PYTHON_OP_LIB("parsing_ops")
GENERATE_PYTHON_OP_LIB("random_ops")
GENERATE_PYTHON_OP_LIB("resource_variable_ops")
GENERATE_PYTHON_OP_LIB("script_ops")
GENERATE_PYTHON_OP_LIB("sdca_ops")
GENERATE_PYTHON_OP_LIB("set_ops")
GENERATE_PYTHON_OP_LIB("state_ops")
GENERATE_PYTHON_OP_LIB("sparse_ops")
GENERATE_PYTHON_OP_LIB("string_ops")
@ -473,10 +507,13 @@ add_dependencies(tf_python_ops tf_python_op_gen_main)
find_package(SWIG REQUIRED)
# Generate the C++ and Python source code for the SWIG wrapper.
# NOTE(mrry): We always regenerate the SWIG wrapper, which means that we must
# always re-link the Python extension, but we don't have to track the
# individual headers on which the SWIG wrapper depends.
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/pywrap_tensorflow.py"
"${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow.cc"
DEPENDS tf_python_touchup_modules
DEPENDS tf_python_touchup_modules __force_rebuild
COMMAND ${SWIG_EXECUTABLE}
ARGS -python -c++
-I${tensorflow_source_dir}
@ -535,7 +572,7 @@ target_link_libraries(pywrap_tensorflow
${tf_core_gpu_kernels_lib}
${tensorflow_EXTERNAL_LIBRARIES}
tf_protos_cc
tf_python_protos_cc
tf_python_protos_cc
${PYTHON_LIBRARIES}
)
@ -545,11 +582,12 @@ target_link_libraries(pywrap_tensorflow
add_custom_target(tf_python_build_pip_package)
add_dependencies(tf_python_build_pip_package
pywrap_tensorflow
tensorboard_copy_dependencies
tf_python_copy_scripts_to_destination
tf_python_touchup_modules
tf_python_ops)
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/cmake/setup.py
COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/setup.py
${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
if(WIN32)
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
@ -566,6 +604,33 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/MANIFEST.in
${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
# Copy resources for TensorBoard.
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/tf_python/setup.py bdist_wheel
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tf_python)
COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/dist/bazel-html-imports.html
${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/dist/)
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/dist/index.html
${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/dist/)
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/dist/tf-tensorboard.html
${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/dist/)
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/lib/css/global.css
${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/lib/css/)
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/TAG
${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/)
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_BINARY_DIR}/tensorboard_external
${CMAKE_CURRENT_BINARY_DIR}/tf_python/external)
if(${tensorflow_ENABLE_GPU})
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/tf_python/setup.py bdist_wheel --project_name tensorflow_gpu
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tf_python)
else()
add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/tf_python/setup.py bdist_wheel
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tf_python)
endif(${tensorflow_ENABLE_GPU})

View File

@ -70,17 +70,18 @@ function(AddTest)
)
foreach(datafile ${_AT_DATA})
file(RELATIVE_PATH datafile_rel ${tensorflow_source_dir} ${datafile})
add_custom_command(
TARGET ${_AT_TARGET} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
"${CMAKE_CURRENT_SOURCE_DIR}/${datafile}"
"${testdir}/${datafile}"
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${datafile}"
"${datafile}"
"${testdir}/${datafile_rel}"
DEPENDS "${datafile}"
)
endforeach()
if (_AT_DEPENDS)
add_dependencies(${_AT_TARGET} ${_AT_DEPENDS})
add_dependencies(${_AT_TARGET} ${_AT_DEPENDS} googletest)
endif()
endfunction(AddTest)
@ -98,7 +99,7 @@ function(AddPythonTests)
endif(_AT_DEPENDS)
foreach(sourcefile ${_AT_SOURCES})
add_test(NAME ${sourcefile} COMMAND ${PYTHON_EXECUTABLE} ${sourcefile})
add_test(NAME ${sourcefile} COMMAND ${PYTHON_EXECUTABLE} ${sourcefile} WORKING_DIRECTORY ${tensorflow_source_dir})
if (_AT_DEPENDS)
add_dependencies(${_AT_TARGET} ${_AT_DEPENDS})
endif()
@ -116,6 +117,9 @@ if (tensorflow_BUILD_PYTHON_TESTS)
# include all tests
file(GLOB_RECURSE tf_test_src_py
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/*.py"
"${tensorflow_source_dir}/tensorflow/python/saved_model/*_test.py"
"${tensorflow_source_dir}/tensorflow/python/training/*_test.py"
"${tensorflow_source_dir}/tensorflow/tensorboard/*_test.py"
)
# exclude the ones we don't want
@ -143,22 +147,21 @@ if (tensorflow_BUILD_PYTHON_TESTS)
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/cwise_ops_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/variable_scope_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/reshape_op_test.py"
"${tensorflow_source_dir}/tensorflow/tensorboard/backend/server_test.py"
# int32/int64 mixup
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/functional_ops_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/py_func_test.py"
# issues related to windows fs
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/io_ops_test.py"
# missing kernel
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/conv_ops_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/depthwise_conv_op_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/pool_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/qr_op_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/svd_op_test.py"
# cuda launch failed
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/diag_op_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/trace_op_test.py"
"${tensorflow_source_dir}/tensorflow/python/kernel_tests/one_hot_op_test.py" # gpu, T=uint8
# training tests
"${tensorflow_source_dir}/tensorflow/python/training/basic_session_run_hooks_test.py" # Needs tf.contrib fix.
"${tensorflow_source_dir}/tensorflow/python/training/localhost_cluster_performance_test.py" # Needs portpicker.
"${tensorflow_source_dir}/tensorflow/python/training/monitored_session_test.py" # Needs tf.contrib fix.
"${tensorflow_source_dir}/tensorflow/python/training/saver_large_variable_test.py" # Overflow error.
"${tensorflow_source_dir}/tensorflow/python/training/supervisor_test.py" # Flaky I/O error on rename.
"${tensorflow_source_dir}/tensorflow/python/training/sync_replicas_optimizer_test.py" # Needs portpicker.
)
endif()
list(REMOVE_ITEM tf_test_src_py ${tf_test_src_py_exclude})
@ -172,7 +175,7 @@ if (tensorflow_BUILD_CC_TESTS)
#
# cc unit tests. Be aware that by default we include 250+ tests which
# will take time and space to build.
# If you wan to cut this down, for example to a specific test, modify
# If you want to cut this down, for example to a specific test, modify
# tf_test_src_simple to your needs
#
@ -195,6 +198,18 @@ if (tensorflow_BUILD_CC_TESTS)
"${tensorflow_source_dir}/tensorflow/c/tf_status_helper.cc"
)
if(WIN32)
set(tf_src_testlib
${tf_src_testlib}
"${tensorflow_source_dir}/tensorflow/core/platform/windows/test.cc"
)
else()
set(tf_src_testlib
${tf_src_testlib}
"${tensorflow_source_dir}/tensorflow/core/platform/posix/test.cc"
)
endif()
# include all tests
file(GLOB_RECURSE tf_test_src_simple
"${tensorflow_source_dir}/tensorflow/cc/*_test.cc"
@ -204,6 +219,15 @@ if (tensorflow_BUILD_CC_TESTS)
"${tensorflow_source_dir}/tensorflow/contrib/rnn/*_test.cc"
)
# exclude the ones we don't want
set(tf_test_src_simple_exclude
# generally not working
"${tensorflow_source_dir}/tensorflow/cc/client/client_session_test.cc"
"${tensorflow_source_dir}/tensorflow/cc/framework/gradients_test.cc"
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/call_options_test.cc"
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/tensor_coding_test.cc"
)
if (NOT tensorflow_ENABLE_GPU)
# exclude gpu tests if we are not building for gpu
set(tf_test_src_simple_exclude
@ -217,15 +241,6 @@ if (tensorflow_BUILD_CC_TESTS)
)
endif()
# exclude the ones we don't want
set(tf_test_src_simple_exclude
# generally not working
"${tensorflow_source_dir}/tensorflow/cc/client/client_session_test.cc"
"${tensorflow_source_dir}/tensorflow/cc/framework/gradients_test.cc"
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/call_options_test.cc"
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/tensor_coding_test.cc"
)
if (WIN32)
set(tf_test_src_simple_exclude
${tf_test_src_simple_exclude}
@ -235,68 +250,26 @@ if (tensorflow_BUILD_CC_TESTS)
# TODO: test failing
"${tensorflow_source_dir}/tensorflow/core/common_runtime/simple_placer_test.cc"
"${tensorflow_source_dir}/tensorflow/core/debug/debug_gateway_test.cc" # hangs
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/executor_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantized_reshape_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/requantization_range_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/requantize_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/restore_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/restore_v2_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/save_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/restore_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantize_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/lib/core/status_test.cc"
"${tensorflow_source_dir}/tensorflow/core/lib/strings/str_util_test.cc"
"${tensorflow_source_dir}/tensorflow/core/lib/strings/numbers_test.cc"
"${tensorflow_source_dir}/tensorflow/core/lib/monitoring/collection_registry_test.cc"
"${tensorflow_source_dir}/tensorflow/core/util/tensor_slice_reader_test.cc"
"${tensorflow_source_dir}/tensorflow/core/platform/file_system_test.cc"
"${tensorflow_source_dir}/tensorflow/core/platform/logging_test.cc"
"${tensorflow_source_dir}/tensorflow/core/platform/env_test.cc"
"${tensorflow_source_dir}/tensorflow/core/ops/math_grad_test.cc"
"${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/cudnn_rnn_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops_test.cc" # status 5
"${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops_test.cc" # status 5
# TODO: not compiling
"${tensorflow_source_dir}/tensorflow/cc/framework/gradient_checker_test.cc"
"${tensorflow_source_dir}/tensorflow/cc/gradients/math_grad_test.cc"
"${tensorflow_source_dir}/tensorflow/cc/gradients/array_grad_test.cc"
"${tensorflow_source_dir}/tensorflow/cc/saved_model/loader_test.cc"
"${tensorflow_source_dir}/tensorflow/cc/training/queue_runner_test.cc"
"${tensorflow_source_dir}/tensorflow/cc/training/coordinator_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/nn_ops_test.cc"
# TODO: not compiling
"${tensorflow_source_dir}/tensorflow/core/kernels/quantization_utils_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/activation_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/batch_norm_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/bias_add_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/concat_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/conv_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/matmul_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/pooling_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantize_down_and_shrink_range_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/non_max_suppression_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/fused_batch_norm_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/hexagon/hexagon_graph_transferer_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/adjust_contrast_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/batch_norm_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/cast_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/colorspace_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/control_flow_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/conv_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/debug_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/resize_bilinear_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/resize_nearest_neighbor_op_benchmark_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/spacetobatch_benchmark_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/sparse_add_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/summary_image_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/summary_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantized_activation_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantized_bias_add_op_test.cc"
"${tensorflow_source_dir}/tensorflow/core/kernels/quantized_concat_op_test.cc"
@ -312,10 +285,8 @@ if (tensorflow_BUILD_CC_TESTS)
"${tensorflow_source_dir}/tensorflow/core/platform/cloud/retrying_file_system_test.cc"
"${tensorflow_source_dir}/tensorflow/core/platform/cloud/time_util_test.cc"
"${tensorflow_source_dir}/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc"
"${tensorflow_source_dir}/tensorflow/core/platform/port_test.cc"
"${tensorflow_source_dir}/tensorflow/core/platform/profile_utils/cpu_utils_test.cc"
"${tensorflow_source_dir}/tensorflow/core/common_runtime/direct_session_test.cc"
"${tensorflow_source_dir}/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc"
"${tensorflow_source_dir}/tensorflow/core/platform/subprocess_test.cc"
"${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_allocator_retry_test.cc"
"${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc"
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/master_test.cc"
@ -325,20 +296,9 @@ if (tensorflow_BUILD_CC_TESTS)
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding_test.cc"
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc"
"${tensorflow_source_dir}/tensorflow/core/distributed_runtime/master_test.cc"
"${tensorflow_source_dir}/tensorflow/core/framework/partial_tensor_shape_test.cc"
"${tensorflow_source_dir}/tensorflow/core/lib/core/notification_test.cc"
"${tensorflow_source_dir}/tensorflow/core/lib/gtl/cleanup_test.cc"
"${tensorflow_source_dir}/tensorflow/core/lib/gtl/edit_distance_test.cc"
"${tensorflow_source_dir}/tensorflow/core/lib/strings/strcat_test.cc"
"${tensorflow_source_dir}/tensorflow/core/ops/array_grad_test.cc"
"${tensorflow_source_dir}/tensorflow/core/ops/nn_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/core/example/example_parser_configuration_test.cc"
"${tensorflow_source_dir}/tensorflow/core/example/feature_util_test.cc"
"${tensorflow_source_dir}/tensorflow/core/util/reporter_test.cc"
"${tensorflow_source_dir}/tensorflow/core/util/memmapped_file_system_test.cc"
"${tensorflow_source_dir}/tensorflow/core/util/sparse_sparse_tensor_test.cc"
"${tensorflow_source_dir}/tensorflow/core/debug/debug_gateway_test.cc"
"${tensorflow_source_dir}/tensorflow/core/debug/debug_io_utils_test.cc"
"${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/clustering_ops_test.cc"
"${tensorflow_source_dir}/tensorflow/contrib/session_bundle/bundle_shim_test.cc"
"${tensorflow_source_dir}/tensorflow/contrib/session_bundle/bundle_test.cc"
@ -348,7 +308,15 @@ if (tensorflow_BUILD_CC_TESTS)
)
endif()
list(REMOVE_ITEM tf_test_src_simple ${tf_test_src_simple_exclude})
# Tests for saved_model require data, so need to treat them separately.
file(GLOB tf_cc_saved_model_test_srcs
"${tensorflow_source_dir}/tensorflow/cc/saved_model/*_test.cc"
)
list(REMOVE_ITEM tf_test_src_simple
${tf_test_src_simple_exclude}
${tf_cc_saved_model_test_srcs}
)
set(tf_test_lib tf_test_lib)
add_library(${tf_test_lib} STATIC ${tf_src_testlib})
@ -360,6 +328,7 @@ if (tensorflow_BUILD_CC_TESTS)
$<TARGET_OBJECTS:tf_core_cpu>
$<TARGET_OBJECTS:tf_core_framework>
$<TARGET_OBJECTS:tf_core_kernels>
$<TARGET_OBJECTS:tf_cc>
$<TARGET_OBJECTS:tf_cc_framework>
$<TARGET_OBJECTS:tf_cc_ops>
$<TARGET_OBJECTS:tf_core_ops>
@ -375,10 +344,23 @@ if (tensorflow_BUILD_CC_TESTS)
${tensorflow_EXTERNAL_LIBRARIES}
)
# All tests that require no data.
AddTests(
SOURCES ${tf_test_src_simple}
OBJECTS ${tf_obj_test}
LIBS ${tf_test_libs}
DEPENDS googletest
)
# Tests for tensorflow/cc/saved_model.
file(GLOB_RECURSE tf_cc_saved_model_test_data
"${tensorflow_source_dir}/tensorflow/cc/saved_model/testdata/*"
)
AddTests(
SOURCES ${tf_cc_saved_model_test_srcs}
DATA ${tf_cc_saved_model_test_data}
OBJECTS ${tf_obj_test}
LIBS ${tf_test_libs}
)
endif(tensorflow_BUILD_CC_TESTS)

View File

@ -52,7 +52,7 @@ class PrettyPrintOpsTest(tf.test.TestCase):
a = tf.Variable(1.0)
a = tf.contrib.framework.print_op(a)
with self.test_session():
tf.initialize_all_variables().run()
tf.global_variables_initializer().run()
a.eval()
if __name__ == "__main__":
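The switch above follows the renaming of the variables-initialization op; a minimal graph-mode sketch of the newer call (illustrative values, assuming a TF 1.x-era `tf.Session` as used in the surrounding tests):

import tensorflow as tf

# Build a variable, then run the renamed initializer op before evaluating
# anything that depends on it.
v = tf.Variable(3.0, name="v")
doubled = v * 2.0

with tf.Session() as sess:
    # tf.global_variables_initializer() supersedes tf.initialize_all_variables().
    sess.run(tf.global_variables_initializer())
    print(sess.run(doubled))  # expected: 6.0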

View File

@ -58,6 +58,20 @@ class TransformTest(tf.test.TestCase):
self.assertEqual(t.name, t_.name)
self.assertEqual(info.original(t_), t)
def test_copy_assert(self):
tf.reset_default_graph()
a = tf.constant(1)
b = tf.constant(1)
eq = tf.equal(a, b)
assert_op = tf.Assert(eq, [a, b])
with tf.control_dependencies([assert_op]):
_ = tf.add(a, b)
sgv = ge.make_view([assert_op, eq.op, a.op, b.op])
copier = ge.Transformer()
copied_sgv, info = copier(sgv, sgv.graph, "", "")
new_assert_op = info.transformed(assert_op)
self.assertIsNotNone(new_assert_op)
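For reference, a self-contained sketch of the subgraph-copy pattern the new test exercises, using only the `graph_editor` calls visible above (`ge.make_view`, `ge.Transformer`, `info.transformed`); the constants are illustrative:

import tensorflow as tf
from tensorflow.contrib import graph_editor as ge

# Build a tiny subgraph containing an Assert op, mirroring the test above.
a = tf.constant(1)
b = tf.constant(1)
eq = tf.equal(a, b)
assert_op = tf.Assert(eq, [a, b])

# View the ops of interest and copy them within the same graph.
sgv = ge.make_view([assert_op, eq.op, a.op, b.op])
copier = ge.Transformer()
copied_sgv, info = copier(sgv, sgv.graph, "", "")

# The transform info maps original ops to their copies.
print(info.transformed(assert_op))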
def test_transform(self):
transformer = ge.Transformer()
def my_transform_op_handler(info, op):

View File

@ -446,10 +446,7 @@ class Transformer(object):
# without any outputs. So the walk is now finalized from those roots.
remaining_ops = [op for op in self._info.sgv.ops
if op not in self._info.transformed_ops]
remaining_roots = [
op for op in remaining_ops
if not op.outputs and not self._info.control_outputs.get(op)
]
remaining_roots = [op for op in remaining_ops if not op.outputs]
for op in remaining_roots:
self._transform_op(op)

View File

@ -1209,7 +1209,7 @@ class WeightedSumTest(tf.test.TestCase):
logits, _, _ = tf.contrib.layers.weighted_sum_from_feature_columns(
features, [hashed_sparse], num_outputs=5)
with self.test_session():
tf.initialize_all_variables().run()
tf.global_variables_initializer().run()
self.assertAllEqual(logits.eval().shape, [2, 5])
def testWeightedSparseColumn(self):
@ -1244,7 +1244,7 @@ class WeightedSumTest(tf.test.TestCase):
features, [weighted_ids], num_outputs=5)
with self.test_session():
tf.initialize_all_variables().run()
tf.global_variables_initializer().run()
tf.initialize_all_tables().run()
self.assertAllEqual(logits.eval().shape, [2, 5])
@ -1846,7 +1846,7 @@ class WeightedSumTest(tf.test.TestCase):
[product],
num_outputs=1))
with self.test_session() as sess:
tf.initialize_all_variables().run()
tf.global_variables_initializer().run()
tf.initialize_all_tables().run()
product_weights = column_to_variable[product][0]
sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]]))
@ -1862,7 +1862,7 @@ class WeightedSumTest(tf.test.TestCase):
[product],
num_outputs=1))
with self.test_session() as sess:
tf.initialize_all_variables().run()
tf.global_variables_initializer().run()
tf.initialize_all_tables().run()
product_weights = column_to_variable[product][0]
sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]]))

View File

@ -171,7 +171,10 @@ def _fused_batch_norm(
`batch_size`. The normalization is over all but the last dimension if
`data_format` is `NHWC` and the second dimension if `data_format` is
`NCHW`.
decay: decay for the moving average.
decay: decay for the moving average. Reasonable values for `decay` are close
to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc. Lower
the `decay` value (for example, try `decay`=0.9) if the model shows reasonably
good training performance but poor validation and/or test performance.
center: If True, subtract `beta`. If False, `beta` is ignored.
scale: If True, multiply by `gamma`. If False, `gamma` is
not used. When the next layer is linear (also e.g. `nn.relu`), this can be
@ -396,7 +399,10 @@ def batch_norm(
`batch_size`. The normalization is over all but the last dimension if
`data_format` is `NHWC` and the second dimension if `data_format` is
`NCHW`.
decay: decay for the moving average.
decay: decay for the moving average. Reasonable values for `decay` are close
to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc. Lower
the `decay` value (for example, try `decay`=0.9) if the model shows reasonably
good training performance but poor validation and/or test performance.
center: If True, subtract `beta`. If False, `beta` is ignored.
scale: If True, multiply by `gamma`. If False, `gamma` is
not used. When the next layer is linear (also e.g. `nn.relu`), this can be
@ -1369,7 +1375,7 @@ def fully_connected(inputs,
Raises:
ValueError: if x has rank less than 2 or if its last dimension is not set.
"""
if not (isinstance(num_outputs, int) or isinstance(num_outputs, long)):
if not (isinstance(num_outputs, six.integer_types)):
raise ValueError('num_outputs should be int or long, got %s.', num_outputs)
layer_variable_getter = _build_variable_getter({'bias': 'biases'})
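As a rough numeric illustration of the `decay` guidance documented above: the moving statistics follow an exponential moving average, so values closer to 1.0 react more slowly to each batch, while a lower decay tracks recent batch statistics faster. A plain-numpy sketch (made-up batch means, not TensorFlow internals):

import numpy as np

def update_moving_mean(moving_mean, batch_mean, decay):
    # Standard exponential-moving-average update used for moving statistics:
    # moving_mean <- decay * moving_mean + (1 - decay) * batch_mean
    return decay * moving_mean + (1.0 - decay) * batch_mean

batch_means = np.array([1.0, 1.1, 0.9, 1.0, 1.05])

for decay in (0.999, 0.99, 0.9):
    moving = 0.0
    for m in batch_means:
        moving = update_moving_mean(moving, m, decay)
    # Lower decay converges toward the recent batch means much faster.
    print(decay, round(moving, 4))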

View File

@ -195,7 +195,7 @@ class OptimizersTest(tf.test.TestCase):
self.assertAlmostEqual(var_value, 9.8916, 4)
self.assertEqual(global_step_value, 1)
var_count = 0
for var in tf.all_variables():
for var in tf.global_variables():
if var.name.startswith("OptimizeLoss/AdaptiveMaxNorm"):
var_count += 1
self.assertEqual(2, var_count)
@ -366,7 +366,7 @@ class AdaptiveClipping(tf.test.TestCase):
decay=0.5)(grads_and_vars)
var_dict = {}
for var in tf.all_variables():
for var in tf.global_variables():
if var.name.startswith("AdaptiveMaxNorm"):
var_dict[var.name.split(":")[0]] = var
self.assertEqual(2, len(var_dict))

View File

@ -186,8 +186,8 @@ def _is_retriable(e):
@retry(initial_delay=1.0, max_delay=16.0, is_retriable=_is_retriable)
def urlretrieve_with_retry(url, filename):
urllib.request.urlretrieve(url, filename)
def urlretrieve_with_retry(url, filename=None):
return urllib.request.urlretrieve(url, filename)
def maybe_download(filename, work_directory, source_url):
@ -205,11 +205,9 @@ def maybe_download(filename, work_directory, source_url):
gfile.MakeDirs(work_directory)
filepath = os.path.join(work_directory, filename)
if not gfile.Exists(filepath):
with tempfile.NamedTemporaryFile() as tmpfile:
temp_file_name = tmpfile.name
urlretrieve_with_retry(source_url, temp_file_name)
gfile.Copy(temp_file_name, filepath)
with gfile.GFile(filepath) as f:
size = f.size()
print('Successfully downloaded', filename, size, 'bytes.')
temp_file_name, _ = urlretrieve_with_retry(source_url)
gfile.Copy(temp_file_name, filepath)
with gfile.GFile(filepath) as f:
size = f.size()
print('Successfully downloaded', filename, size, 'bytes.')
return filepath
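The rewrite above relies on the fact that `urllib.request.urlretrieve(url)` called without a filename downloads into a temporary file and returns its path, so `urlretrieve_with_retry` can simply return that tuple. A standalone sketch of that behaviour (plain `six`/`urllib`, placeholder URL, hypothetical helper name):

from six.moves import urllib

def download_to_temp(url):
    # When filename is None, urlretrieve creates a temporary file itself
    # and returns (temp_path, http_headers).
    temp_path, _ = urllib.request.urlretrieve(url)
    return temp_path

# Example (placeholder URL, not executed here):
# path = download_to_temp("http://example.com/data.bin")
# print(path)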

View File

@ -330,8 +330,8 @@ class BaseEstimator(
# Features and labels TensorSignature objects.
# TODO(wicke): Rename these to something more descriptive
self._features_info = None
self._labels_info = None
self._features_info = {}
self._labels_info = {}
self._graph = None
@ -641,28 +641,29 @@ class BaseEstimator(
return tensor_signature.create_example_parser_from_signatures(
self._features_info, examples_batch)
def _check_inputs(self, features, labels):
if self._features_info is not None:
logging.debug('Given features: %s, required signatures: %s.',
str(features), str(self._features_info))
if not tensor_signature.tensors_compatible(features, self._features_info):
raise ValueError('Features are incompatible with given information. '
def _check_inputs(self, features, labels, mode):
if mode in self._features_info:
logging.debug('Given features for mode %s: %s, required signatures: %s.',
mode, str(features), str(self._features_info[mode]))
if not tensor_signature.tensors_compatible(features, self._features_info[mode]):
raise ValueError('Features for mode %s are incompatible with given information. '
'Given features: %s, required signatures: %s.' %
(str(features), str(self._features_info)))
(mode, str(features), str(self._features_info[mode])))
else:
self._features_info = tensor_signature.create_signatures(features)
logging.debug('Setting feature info to %s.', str(self._features_info))
self._features_info[mode] = tensor_signature.create_signatures(features)
logging.debug('Setting feature info for mode %s to %s.', mode, str(self._features_info[mode]))
if labels is not None:
if self._labels_info is not None:
if mode in self._labels_info:
logging.debug('Given labels: %s, required signatures: %s.',
str(labels), str(self._labels_info))
if not tensor_signature.tensors_compatible(labels, self._labels_info):
raise ValueError('Labels are incompatible with given information. '
if not tensor_signature.tensors_compatible(labels, self._labels_info[mode]):
raise ValueError('Labels for mode %s are incompatible with given information. '
'Given labels: %s, required signatures: %s.' %
(str(labels), str(self._labels_info)))
(mode, str(labels), str(self._labels_info[mode])))
else:
self._labels_info = tensor_signature.create_signatures(labels)
logging.debug('Setting labels info to %s', str(self._labels_info))
self._labels_info[mode] = tensor_signature.create_signatures(labels)
logging.debug('Setting labels info for mode %s to %s', mode, str(self._labels_info[mode]))
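The change above turns `_features_info`/`_labels_info` into per-mode caches keyed by `ModeKeys`, so train and eval inputs are each validated against their own signature. A stripped-down sketch of that caching idea, independent of the Estimator internals (the class and values here are hypothetical, not the real API):

# Minimal illustration of a per-mode signature cache (hypothetical helper,
# not the actual BaseEstimator code).
class SignatureCache(object):
    def __init__(self):
        self._signatures = {}  # mode -> signature

    def check(self, mode, signature):
        if mode in self._signatures:
            if self._signatures[mode] != signature:
                raise ValueError(
                    'Inputs for mode %s are incompatible with the recorded '
                    'signature %s.' % (mode, self._signatures[mode]))
        else:
            # First time this mode is seen: record its signature.
            self._signatures[mode] = signature

cache = SignatureCache()
cache.check('train', ('float64', (None, 13)))
cache.check('eval', ('float64', (None, 13)))   # separate entry per mode
cache.check('train', ('float64', (None, 13)))  # compatible, passes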
def _train_model(self,
input_fn,
@ -699,8 +700,7 @@ class BaseEstimator(
random_seed.set_random_seed(self._config.tf_random_seed)
global_step = contrib_framework.create_global_step(g)
features, labels = input_fn()
self._check_inputs(features, labels)
self._check_inputs(features, labels, model_fn_lib.ModeKeys.TRAIN)
# The default return type of _get_train_ops is ModelFnOps. But there are
# some subclasses of tf.contrib.learn.Estimator which override this
# method and use the legacy signature, namely _get_train_ops returns a
@ -800,8 +800,7 @@ class BaseEstimator(
random_seed.set_random_seed(self._config.tf_random_seed)
global_step = contrib_framework.create_global_step(g)
features, labels = input_fn()
self._check_inputs(features, labels)
self._check_inputs(features, labels, model_fn_lib.ModeKeys.EVAL)
# The default return type of _get_eval_ops is ModelFnOps. But there are
# some subclasses of tf.contrib.learn.Estimator which override this
# method and use the legacy signature, namely _get_eval_ops returns an
@ -835,6 +834,29 @@ class BaseEstimator(
return result[0]
return result
def _set_infer_mode_feature_signature(self, features):
for mode in list(self._features_info.keys()):
if tensor_signature.tensors_compatible(features, self._features_info[mode]):
self._features_info[model_fn_lib.ModeKeys.INFER] = self._features_info[mode]
if mode in self._labels_info:
self._labels_info[model_fn_lib.ModeKeys.INFER] = (
self._labels_info[mode])
else:
self._labels_info[model_fn_lib.ModeKeys.INFER] = None
break
if model_fn_lib.ModeKeys.INFER not in self._features_info:
logging.warning('Features for mode %s are compatible with neither train mode nor eval mode.'
' Given features: %s' % (model_fn_lib.ModeKeys.INFER, str(features)))
for mode in list(self._features_info.keys()):
logging.warning('Whereas %s mode signatures: %s' % (mode, str(self._features_info[mode])))
self._check_inputs(features, None, model_fn_lib.ModeKeys.INFER)
if model_fn_lib.ModeKeys.TRAIN in self._labels_info:
logging.warning('Setting labels info for mode infer equal to that of labels info for train mode')
self._labels_info[model_fn_lib.ModeKeys.INFER] = self._labels_info[model_fn_lib.ModeKeys.TRAIN]
else:
self._labels_info[model_fn_lib.ModeKeys.INFER] = {}
def _infer_model(
self, input_fn, feed_fn=None, outputs=None, as_iterable=True):
# Check that model has been trained.
@ -1134,8 +1156,10 @@ class Estimator(BaseEstimator):
Returns:
`ModelFnOps` object.
"""
self._set_infer_mode_feature_signature(features)
labels = tensor_signature.create_placeholders_from_signatures(
self._labels_info)
self._labels_info[model_fn_lib.ModeKeys.INFER])
return self._call_model_fn(features, labels, model_fn_lib.ModeKeys.INFER)
@experimental
@ -1239,7 +1263,7 @@ class Estimator(BaseEstimator):
return export_dir
# For time of deprecation x,y from Estimator allow direct access.
# For time of deprecation x,y from Estimator allow direct access
# pylint: disable=protected-access
class SKCompat(sklearn.BaseEstimator):
"""Scikit learn wrapper for TensorFlow Learn Estimator."""

View File

@ -91,7 +91,18 @@ def boston_eval_fn():
0)
def extract(data, key):
if isinstance(data, dict):
assert key in data
return data[key]
else:
return data
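`extract` simply unwraps dictionary-valued inputs so the same model functions accept both plain arrays and the new dictionary inputs; a tiny illustration (values are made up):

import numpy as np

def extract(data, key):
    # Same helper as above: pull `key` out of dict inputs, pass arrays through.
    if isinstance(data, dict):
        assert key in data
        return data[key]
    else:
        return data

x = np.ones((2, 3))
print(extract(x, 'input') is x)              # True: arrays pass through
print(extract({'input': x}, 'input') is x)   # True: dicts are unwrapped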
def linear_model_params_fn(features, labels, mode, params):
features = extract(features, 'input')
labels = extract(labels, 'labels')
assert mode in (
tf.contrib.learn.ModeKeys.TRAIN,
tf.contrib.learn.ModeKeys.EVAL,
@ -106,6 +117,8 @@ def linear_model_params_fn(features, labels, mode, params):
def linear_model_fn(features, labels, mode):
features = extract(features, 'input')
labels = extract(labels, 'labels')
assert mode in (
tf.contrib.learn.ModeKeys.TRAIN,
tf.contrib.learn.ModeKeys.EVAL,
@ -140,8 +153,8 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode):
def logistic_model_no_mode_fn(features, labels):
if isinstance(labels, dict):
labels = labels['labels']
features = extract(features, 'input')
labels = extract(labels, 'labels')
labels = tf.one_hot(labels, 3, 1, 0)
prediction, loss = (
tf.contrib.learn.models.logistic_regression_zero_init(features, labels)
@ -346,6 +359,34 @@ class EstimatorTest(tf.test.TestCase):
with self.assertRaises(tf.contrib.learn.NotFittedError):
est.predict(x=boston.data)
def testContinueTrainingDictionaryInput(self):
boston = tf.contrib.learn.datasets.load_boston()
output_dir = tempfile.mkdtemp()
est = tf.contrib.learn.Estimator(model_fn=linear_model_fn,
model_dir=output_dir)
boston_input = {'input': boston.data}
float64_target = {'labels': boston.target.astype(np.float64)}
est.fit(x=boston_input, y=float64_target, steps=50)
scores = est.evaluate(
x=boston_input,
y=float64_target,
metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error})
del est
# Create another estimator object with the same output dir.
est2 = tf.contrib.learn.Estimator(model_fn=linear_model_fn,
model_dir=output_dir)
# Check we can evaluate and predict.
scores2 = est2.evaluate(
x=boston_input,
y=float64_target,
metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error})
self.assertAllClose(scores2['MSE'],
scores['MSE'])
predictions = np.array(list(est2.predict(x=boston_input)))
other_score = _sklearn.mean_squared_error(predictions, float64_target['labels'])
self.assertAllClose(other_score, scores['MSE'])
def testContinueTraining(self):
boston = tf.contrib.learn.datasets.load_boston()
output_dir = tempfile.mkdtemp()
@ -405,6 +446,22 @@ class EstimatorTest(tf.test.TestCase):
self.assertTrue('global_step' in scores)
self.assertEqual(100, scores['global_step'])
def testBostonAllDictionaryInput(self):
boston = tf.contrib.learn.datasets.load_boston()
est = tf.contrib.learn.Estimator(model_fn=linear_model_fn)
boston_input = {'input': boston.data}
float64_target = {'labels': boston.target.astype(np.float64)}
est.fit(x=boston_input, y=float64_target, steps=100)
scores = est.evaluate(
x=boston_input,
y=float64_target,
metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error})
predictions = np.array(list(est.predict(x=boston_input)))
other_score = _sklearn.mean_squared_error(predictions, boston.target)
self.assertAllClose(other_score, scores['MSE'])
self.assertTrue('global_step' in scores)
self.assertEqual(scores['global_step'], 100)
def testIrisAll(self):
iris = tf.contrib.learn.datasets.load_iris()
est = tf.contrib.learn.SKCompat(
@ -428,6 +485,31 @@ class EstimatorTest(tf.test.TestCase):
self.assertTrue('global_step' in scores)
self.assertEqual(100, scores['global_step'])
def testIrisAllDictionaryInput(self):
iris = tf.contrib.learn.datasets.load_iris()
est = tf.contrib.learn.Estimator(model_fn=logistic_model_no_mode_fn)
iris_data = {'input': iris.data}
iris_target = {'labels': iris.target}
est.fit(iris_data, iris_target, steps=100)
scores = est.evaluate(
x=iris_data,
y=iris_target,
metrics={('accuracy', 'class'): tf.contrib.metrics.streaming_accuracy})
predictions = list(est.predict(x=iris_data))
predictions_class = list(est.predict(x=iris_data, outputs=['class']))
self.assertEqual(len(predictions), iris.target.shape[0])
classes_batch = np.array([p['class'] for p in predictions])
self.assertAllClose(
classes_batch,
np.array([p['class'] for p in predictions_class]))
self.assertAllClose(
classes_batch,
np.argmax(np.array([p['prob'] for p in predictions]), axis=1))
other_score = _sklearn.accuracy_score(iris.target, classes_batch)
self.assertAllClose(other_score, scores['accuracy'])
self.assertTrue('global_step' in scores)
self.assertEqual(scores['global_step'], 100)
def testIrisInputFn(self):
iris = tf.contrib.learn.datasets.load_iris()
est = tf.contrib.learn.Estimator(model_fn=logistic_model_no_mode_fn)

View File

@ -660,9 +660,10 @@ class LinearRegressor(evaluable.Evaluable, trainable.Trainable):
"""
self._feature_columns = feature_columns
assert self._feature_columns
self._optimizer = _get_default_optimizer(feature_columns)
if optimizer:
self._optimizer = _get_optimizer(optimizer)
else:
self._optimizer = _get_default_optimizer(feature_columns)
chief_hook = None
if (isinstance(optimizer, sdca_optimizer.SDCAOptimizer) and

View File

@ -51,12 +51,14 @@ class Evaluable(object):
for which this evaluation was performed.
Args:
x: Matrix of shape [n_samples, n_features...] containing the input samples
for fitting the model. Can be iterator that returns arrays of features.
If set, `input_fn` must be `None`.
x: Matrix of shape [n_samples, n_features...] or dictionary of many matrices
containing the input samples for fitting the model. Can be iterator that returns
arrays of features or a dictionary of arrays of features. If set, `input_fn` must
be `None`.
y: Vector or matrix [n_samples] or [n_samples, n_outputs] containing the
label values (class labels in classification, real numbers in
regression). Can be iterator that returns array of labels. If set,
regression) or dictionary of multiple vectors/matrices. Can be iterator
that returns an array of targets or a dictionary of arrays of targets. If set,
`input_fn` must be `None`. Note: For classification, label values must
be integers representing the class index (i.e. values from 0 to
n_classes-1).

View File

@ -299,10 +299,10 @@ def _monitored_train(graph,
while not super_sess.should_stop():
_, loss = super_sess.run([train_op, loss_op], feed_fn() if feed_fn else
None)
summary_io.SummaryWriterCache.clear()
return loss
# TODO(ispir): Deprecate train in favor of supervised_train
def train(graph,
output_dir,

View File

@ -36,27 +36,49 @@ from tensorflow.python.platform import tf_logging as logging
# pylint: disable=g-multiple-import,g-bad-import-order
from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels
from .dask_io import HAS_DASK, extract_dask_data, extract_dask_labels
# pylint: enable=g-multiple-import,g-bad-import-order
def _get_in_out_shape(x_shape, y_shape, n_classes, batch_size=None):
"""Returns shape for input and output of the data feeder."""
x_is_dict, y_is_dict = isinstance(x_shape, dict), y_shape is not None and isinstance(y_shape, dict)
if y_is_dict and n_classes is not None:
assert (isinstance(n_classes, dict))
if batch_size is None:
batch_size = x_shape[0]
batch_size = list(x_shape.values())[0][0] if x_is_dict else x_shape[0]
elif batch_size <= 0:
raise ValueError('Invalid batch_size %d.' % batch_size)
x_shape = list(x_shape[1:]) if len(x_shape) > 1 else [1]
input_shape = [batch_size] + x_shape
if x_is_dict:
input_shape = {}
for k, v in list(x_shape.items()):
input_shape[k] = [batch_size] + (list(v[1:]) if len(v) > 1 else [1])
else:
x_shape = list(x_shape[1:]) if len(x_shape) > 1 else [1]
input_shape = [batch_size] + x_shape
if y_shape is None:
return input_shape, None, batch_size
y_shape = list(y_shape[1:]) if len(y_shape) > 1 else []
# Skip first dimension if it is 1.
if y_shape and y_shape[0] == 1:
y_shape = y_shape[1:]
if n_classes is not None and n_classes > 1:
output_shape = [batch_size] + y_shape + [n_classes]
def out_el_shape(out_shape, num_classes):
out_shape = list(out_shape[1:]) if len(out_shape) > 1 else []
# Skip first dimension if it is 1.
if out_shape and out_shape[0] == 1:
out_shape = out_shape[1:]
if num_classes is not None and num_classes > 1:
return [batch_size] + out_shape + [num_classes]
else:
return [batch_size] + out_shape
if not y_is_dict:
output_shape = out_el_shape(y_shape, n_classes)
else:
output_shape = [batch_size] + y_shape
output_shape = dict([(k, out_el_shape(v, n_classes[k] if n_classes is not None and k in n_classes else None))
for k, v in list(y_shape.items())])
return input_shape, output_shape, batch_size
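A quick sketch of what the dict branches above produce, assuming the module is importable as `tensorflow.contrib.learn.python.learn.learn_io.data_feeder` (the relative imports above do not show the full path); the shapes are illustrative:

from tensorflow.contrib.learn.python.learn.learn_io.data_feeder import (
    _get_in_out_shape)

# Dict-valued feature shapes: each key gets [batch_size] + feature dims.
inp, out, bs = _get_in_out_shape(
    x_shape={'a': (100, 5), 'b': (100, 3, 2)},
    y_shape={'labels': (100,)},
    n_classes={'labels': 3},
    batch_size=32)

print(inp)  # {'a': [32, 5], 'b': [32, 3, 2]}
print(out)  # {'labels': [32, 3]}  -- one-hot dimension appended
print(bs)   # 32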
@ -78,15 +100,18 @@ def _is_iterable(x):
def setup_train_data_feeder(
x, y, n_classes, batch_size=None, shuffle=True, epochs=None):
x, y, n_classes, batch_size=None, shuffle=True, epochs=None):
"""Create data feeder, to sample inputs from dataset.
If `x` and `y` are iterators, use `StreamingDataFeeder`.
Args:
x: numpy, pandas or Dask matrix or iterable.
y: numpy, pandas or Dask array or iterable.
n_classes: number of classes.
x: numpy, pandas or Dask matrix or dictionary of aforementioned. Also
supports iterables.
y: numpy, pandas or Dask array or dictionary of aforementioned. Also supports
iterables.
n_classes: number of classes. Must be `None` or the same type as `y`. In case `y` is a `dict`
(or an iterable which returns a dict), `n_classes[key]` is the number of classes for `y[key]`.
batch_size: size to split data into parts. Must be >= 1.
shuffle: Whether to shuffle the inputs.
epochs: Number of epochs to run.
@ -102,7 +127,7 @@ def setup_train_data_feeder(
# pylint: disable=g-import-not-at-top
import dask.dataframe as dd
if (isinstance(x, (dd.Series, dd.DataFrame)) and
(y is None or isinstance(y, (dd.Series, dd.DataFrame)))):
(y is None or isinstance(y, (dd.Series, dd.DataFrame)))):
data_feeder_cls = DaskDataFeeder
else:
data_feeder_cls = DataFeeder
@ -115,31 +140,54 @@ def setup_train_data_feeder(
'streaming learning to work.')
return StreamingDataFeeder(x, y, n_classes, batch_size)
return data_feeder_cls(
x, y, n_classes, batch_size, shuffle=shuffle, epochs=epochs)
x, y, n_classes, batch_size, shuffle=shuffle, epochs=epochs)
def _batch_data(x, batch_size=None):
if (batch_size is not None) and (batch_size <= 0):
raise ValueError('Invalid batch_size %d.' % batch_size)
chunk = []
x_first_el = six.next(x)
x = itertools.chain([x_first_el], x)
chunk = dict([(k, []) for k in list(x_first_el.keys())]) if isinstance(x_first_el, dict) else []
chunk_filled = False
for data in x:
chunk.append(data)
if (batch_size is not None) and (len(chunk) >= batch_size):
yield np.matrix(chunk)
chunk = []
yield np.matrix(chunk)
if isinstance(data, dict):
for k, v in list(data.items()):
chunk[k].append(v)
if (batch_size is not None) and (len(chunk[k]) >= batch_size):
chunk[k] = np.matrix(chunk[k])
chunk_filled = True
if chunk_filled:
yield chunk
chunk = dict([(k, []) for k in list(x_first_el.keys())]) if isinstance(x_first_el, dict) else []
chunk_filled = False
else:
chunk.append(data)
if (batch_size is not None) and (len(chunk) >= batch_size):
yield np.matrix(chunk)
chunk = []
if isinstance(x_first_el, dict):
for k, v in list(data.items()):
chunk[k] = np.matrix(chunk[k])
yield chunk
else:
yield np.matrix(chunk)
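For reference, a small sketch of how the dict-aware `_batch_data` behaves when fed an iterator of dictionaries: per-key lists are accumulated and a dict of `np.matrix` chunks is yielded once every key reaches `batch_size` (same assumed module path as above; sample values are illustrative):

import numpy as np
from tensorflow.contrib.learn.python.learn.learn_io.data_feeder import (
    _batch_data)

# An iterator of dict samples; each value is a single feature row.
samples = iter([{'a': np.array([i, i]), 'b': np.array([i])} for i in range(3)])

for chunk in _batch_data(samples, batch_size=2):
    # Full chunks have batch_size rows per key; the trailing partial chunk
    # holds the remainder.
    print({k: v.shape for k, v in chunk.items()})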
def setup_predict_data_feeder(x, batch_size=None):
"""Returns an iterable for feeding into predict step.
Args:
x: numpy, pandas, Dask array or iterable.
batch_size: Size of batches to split data into.
If `None`, returns one batch of full size.
x: numpy, pandas, Dask array or dictionary of aforementioned. Also supports
iterable.
batch_size: Size of batches to split data into. If `None`, returns one
batch of full size.
Returns:
List or iterator of parts of data to predict on.
List or iterator (or dictionary thereof) of parts of data to predict on.
Raises:
ValueError: if `batch_size` <= 0.
@ -211,7 +259,7 @@ def _access(data, iloc):
def _check_dtype(dtype):
if dtypes.as_dtype(dtype) == dtypes.float64:
logging.warn(
'float64 is not supported by many models, consider casting to float32.')
'float64 is not supported by many models, consider casting to float32.')
return dtype
@ -219,63 +267,85 @@ class DataFeeder(object):
"""Data feeder is an example class to sample data for TF trainer."""
def __init__(
self, x, y, n_classes, batch_size=None, shuffle=True, random_state=None,
epochs=None):
self, x, y, n_classes, batch_size=None, shuffle=True, random_state=None,
epochs=None):
"""Initializes a DataFeeder instance.
Args:
x: Feature Nd numpy matrix of shape `[n_samples, n_features, ...]`.
y: Label vector, either floats for regression or class id for
classification. If matrix, will consider as a sequence
of labels. Can be `None` for unsupervised setting.
x: One feature sample, which can be either an Nd numpy matrix of shape
`[n_samples, n_features, ...]` or a dictionary of Nd numpy matrices.
y: label vector, either floats for regression or class id for
classification. If matrix, will consider as a sequence of labels.
Can be `None` for unsupervised setting. Also supports dictionary of
labels.
n_classes: Number of classes, 0 and 1 are considered regression, `None`
will pass through the input labels without one-hot conversion.
batch_size: Mini-batch size to accumulate.
will pass through the input labels without one-hot conversion. Also, if
`y` is `dict`, then `n_classes` must be `dict` such that
`n_classes[key] = n_classes for label y[key]`, `None` otherwise.
batch_size: Mini-batch size to accumulate samples in one mini batch.
shuffle: Whether to shuffle `x`.
random_state: Numpy `RandomState` object to reproduce sampling.
epochs: Number of times to iterate over input data before raising
`StopIteration` exception.
Attributes:
x: Input features.
y: Input label.
x: Input features (ndarray or dictionary of ndarrays).
y: Input label (ndarray or dictionary of ndarrays).
n_classes: Number of classes (if `None`, pass through indices without
one-hot conversion).
batch_size: Mini-batch size to accumulate.
input_shape: Shape of the input.
output_shape: Shape of the output.
input_dtype: DType of input.
output_dtype: DType of output.
input_shape: Shape of the input (or dictionary of shapes).
output_shape: Shape of the output (or dictionary of shapes).
input_dtype: DType of input (or dictionary of dtypes).
output_dtype: DType of output (or dictionary of dtypes).
"""
self._x = check_array(x, dtype=x.dtype)
# self.n_classes is None means we're passing in raw label indices.
y_dtype = (
np.int64 if n_classes is not None and n_classes > 1 else np.float32)
x_is_dict, y_is_dict = isinstance(x, dict), y is not None and isinstance(y, dict)
if isinstance(y, list):
y = np.array(y)
self._x = dict([(k, check_array(v, v.dtype)) for k, v in list(x.items())]) if x_is_dict else check_array(x, x.dtype)
self._y = None if y is None else \
dict([(k, check_array(v, v.dtype)) for k, v in list(y.items())]) if y_is_dict else check_array(y, y.dtype)
# self.n_classes is not None means we're converting raw target indices to one-hot.
if n_classes is not None:
self._y = (None if y is None else check_array(y, dtype=y_dtype))
elif isinstance(y, list):
self._y = np.array(y)
else:
self._y = y
if not y_is_dict:
y_dtype = (np.int64 if n_classes is not None and n_classes > 1 else np.float32)
self._y = (None if y is None else check_array(y, dtype=y_dtype))
self.n_classes = n_classes
self.max_epochs = epochs
x_shape = dict([(k, v.shape) for k, v in list(self._x.items())]) if x_is_dict else self._x.shape
y_shape = dict(
[(k, v.shape) for k, v in list(self._y.items())]) if y_is_dict else None if y is None else self._y.shape
self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(
self._x.shape, None if self._y is None else self._y.shape, n_classes,
batch_size)
x_shape, y_shape, n_classes, batch_size)
# Input dtype matches dtype of x.
self._input_dtype = _check_dtype(self._x.dtype)
# self.n_classes is None means we're passing in raw label indices
if n_classes is not None or self._y is None:
self._output_dtype = np.float32
else:
self._output_dtype = _check_dtype(self._y.dtype)
self._input_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(self._x.items())]) if x_is_dict \
else _check_dtype(self._x.dtype)
# note: self._output_dtype = np.float32 when y is None
self._output_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(self._y.items())]) if y_is_dict \
else _check_dtype(self._y.dtype) if y is not None else np.float32
# self.n_classes is None means we're passing in raw target indices
if n_classes is not None and y_is_dict:
for key in list(n_classes.keys()):
if key in self._output_dtype:
self._output_dtype[key] = np.float32
self._shuffle = shuffle
self.random_state = np.random.RandomState(
42) if random_state is None else random_state
42) if random_state is None else random_state
num_samples = list(self._x.values())[0].shape[0] if x_is_dict else self._x.shape[0]
if self._shuffle:
self.indices = self.random_state.permutation(self._x.shape[0])
self.indices = self.random_state.permutation(num_samples)
else:
self.indices = np.array(range(self._x.shape[0]))
self.indices = np.array(range(num_samples))
self.offset = 0
self.epoch = 0
self._epoch_placeholder = None
@ -320,19 +390,27 @@ class DataFeeder(object):
Returns:
Two placeholders for inputs and outputs.
"""
input_shape = [None] + self.input_shape[1:]
self._input_placeholder = array_ops.placeholder(
dtypes.as_dtype(self._input_dtype),
input_shape,
name='input')
if self.output_shape is None:
self._output_placeholder = None
else:
output_shape = [None] + self.output_shape[1:]
self._output_placeholder = array_ops.placeholder(
dtypes.as_dtype(self._output_dtype),
output_shape,
name='output')
def get_placeholder(shape, dtype, name_prepend):
if shape is None:
return None
if isinstance(shape, dict):
placeholder = {}
for key in list(shape.keys()):
placeholder[key] = array_ops.placeholder(
dtypes.as_dtype(dtype[key]),
[None] + shape[key][1:],
name=name_prepend + '_' + key
)
else:
placeholder = array_ops.placeholder(
dtypes.as_dtype(dtype),
[None] + shape[1:],
name=name_prepend)
return placeholder
self._input_placeholder = get_placeholder(self.input_shape, self._input_dtype, 'input')
self._output_placeholder = get_placeholder(self.output_shape, self._output_dtype, 'output')
return self._input_placeholder, self._output_placeholder
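With dictionary inputs, `input_builder` now returns a dictionary of placeholders named `<name_prepend>_<key>`. A short sketch of what a caller sees, using dict-wrapped features and labels as in the tests further below (assumed import path, illustrative data):

import numpy as np
from tensorflow.contrib.learn.python.learn.learn_io import data_feeder

x = {'age': np.array([[1.0], [2.0]], dtype=np.float32),
     'height': np.array([[3.0], [4.0]], dtype=np.float32)}
y = {'labels': np.array([0, 1])}

feeder = data_feeder.DataFeeder(x, y, n_classes={'labels': 2}, batch_size=2)
inp, out = feeder.input_builder()

# `inp` and `out` are dicts of placeholders keyed like `x` and `y`.
print({k: v.name for k, v in inp.items()})  # e.g. {'age': 'input_age:0', 'height': 'input_height:0'}
print({k: v.name for k, v in out.items()})  # e.g. {'labels': 'output_labels:0'}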
def set_placeholders(self, input_placeholder, output_placeholder):
@ -342,21 +420,21 @@ class DataFeeder(object):
input_placeholder: Placeholder for `x` variable. Should match shape
of the examples in the x dataset.
output_placeholder: Placeholder for `y` variable. Should match
shape of the examples in the y dataset. Can be None.
shape of the examples in the y dataset. Can be `None`.
"""
self._input_placeholder = input_placeholder
self._output_placeholder = output_placeholder
def get_feed_params(self):
"""Function returns a dict with data feed params while training.
"""Function returns a `dict` with data feed params while training.
Returns:
A dict with data feed params while training.
A `dict` with data feed params while training.
"""
return {
'epoch': self.epoch,
'offset': self.offset,
'batch_size': self._batch_size
'epoch': self.epoch,
'offset': self.offset,
'batch_size': self._batch_size
}
def get_feed_dict_fn(self):
@ -364,8 +442,35 @@ class DataFeeder(object):
Returns:
A function that when called samples a random subset of batch size
from x and y.
from `x` and `y`.
"""
x_is_dict, y_is_dict = isinstance(self._x, dict), self._y is not None and isinstance(self._y, dict)
# Assign input features from random indices.
def extract(data, indices):
return (np.array(_access(data, indices)).reshape((indices.shape[0], 1))
if len(data.shape) == 1 else _access(data, indices))
# assign labels from random indices
def assign_label(data, shape, dtype, n_classes, indices):
shape[0] = indices.shape[0]
out = np.zeros(shape, dtype=dtype)
for i in xrange(out.shape[0]):
sample = indices[i]
# self.n_classes is None means we're passing in raw target indices
if n_classes is None:
out[i] = _access(data, sample)
else:
if n_classes > 1:
if len(shape) == 2:
out.itemset((i, int(_access(data, sample))), 1.0)
else:
for idx, value in enumerate(_access(data, sample)):
out.itemset(tuple([i, idx, value]), 1.0)
else:
out[i] = _access(data, sample)
return out
def _feed_dict_fn():
"""Function that samples data into given placeholders."""
if self.max_epochs is not None and self.epoch + 1 > self.max_epochs:
@ -376,20 +481,19 @@ class DataFeeder(object):
feed_dict[self._epoch_placeholder.name] = [self.epoch]
# Take next batch of indices.
end = min(self._x.shape[0], self.offset + self._batch_size)
x_len = list(self._x.values())[0].shape[0] if x_is_dict else self._x.shape[0]
end = min(x_len, self.offset + self._batch_size)
batch_indices = self.indices[self.offset:end]
# Assign input features from random indices.
inp = (
np.array(_access(self._x, batch_indices)).reshape(
(batch_indices.shape[0], 1))
if len(self._x.shape) == 1 else _access(self._x, batch_indices))
feed_dict[self._input_placeholder.name] = inp
# adding input placeholder
feed_dict.update(
dict([(self._input_placeholder[k].name, extract(v, batch_indices)) for k, v in list(self._x.items())])
if x_is_dict else {self._input_placeholder.name: extract(self._x, batch_indices)})
# move offset and reset it if necessary
self.offset += self._batch_size
if self.offset >= self._x.shape[0]:
self.indices = self.random_state.permutation(self._x.shape[0])
if self.offset >= x_len:
self.indices = self.random_state.permutation(x_len) if self._shuffle else np.array(range(x_len))
self.offset = 0
self.epoch += 1
@ -397,24 +501,18 @@ class DataFeeder(object):
if self._output_placeholder is None:
return feed_dict
# assign labels from random indices
self.output_shape[0] = batch_indices.shape[0]
out = np.zeros(self.output_shape, dtype=self._output_dtype)
for i in xrange(out.shape[0]):
sample = batch_indices[i]
# self.n_classes is None means we're passing in raw label indices
if self.n_classes is None:
out[i] = _access(self._y, sample)
else:
if self.n_classes > 1:
if len(self.output_shape) == 2:
out.itemset((i, int(_access(self._y, sample))), 1.0)
else:
for idx, value in enumerate(_access(self._y, sample)):
out.itemset(tuple([i, idx, value]), 1.0)
else:
out[i] = _access(self._y, sample)
feed_dict[self._output_placeholder.name] = out
# adding output placeholders
if y_is_dict:
for k, v in list(self._y.items()):
n_classes = (
self.n_classes[k] if k in self.n_classes else None) if self.n_classes is not None else None
shape, dtype = self.output_shape[k], self._output_dtype[k]
feed_dict.update(
{self._output_placeholder[k].name: assign_label(v, shape, dtype, n_classes, batch_indices)})
else:
shape, dtype, n_classes = self.output_shape, self._output_dtype, self.n_classes
feed_dict.update(
{self._output_placeholder.name: assign_label(self._y, shape, dtype, n_classes, batch_indices)})
return feed_dict
@ -433,21 +531,29 @@ class StreamingDataFeeder(DataFeeder):
"""Initializes a StreamingDataFeeder instance.
Args:
x: iterator that returns for each element, returns features.
y: iterator that returns for each element, returns 1 or many classes /
regression values.
n_classes: indicator of how many classes the label has.
batch_size: Mini batch size to accumulate.
x: iterator each element of which returns one feature sample. Sample can
be an Nd numpy matrix or a dictionary of Nd numpy matrices.
y: iterator each element of which returns one label sample. Sample can be
an Nd numpy matrix or a dictionary of Nd numpy matrices with 1 or many
classes or regression values.
n_classes: indicator of how many classes the corresponding label sample
has for the purposes of one-hot conversion of label. In case where `y`
is a dictionary, `n_classes` must be dictionary (with same keys as `y`)
of how many classes there are in each label in `y`. If key is
present in `y` and missing in `n_classes`, the value is assumed `None`
and no one-hot conversion will be applied to the label with that key.
batch_size: Mini batch size to accumulate samples in one batch. If set to
`None`, the iterator is assumed to return already batched elements.
Attributes:
x: input features.
y: input label.
x: input features (or dictionary of input features).
y: input label (or dictionary of labels).
n_classes: number of classes.
batch_size: mini batch size to accumulate.
input_shape: shape of the input.
output_shape: shape of the output.
input_dtype: dtype of input.
output_dtype: dtype of output.
input_shape: shape of the input (can be dictionary depending on `x`).
output_shape: shape of the output (can be dictionary depending on `y`).
input_dtype: dtype of input (can be dictionary depending on `x`).
output_dtype: dtype of output (can be dictionary depending on `y`).
"""
# pylint: disable=invalid-name,super-init-not-called
x_first_el = six.next(x)
@ -459,25 +565,48 @@ class StreamingDataFeeder(DataFeeder):
y_first_el = None
self._y = None
self.n_classes = n_classes
x_first_el = ops.convert_to_tensor(x_first_el)
y_first_el = ops.convert_to_tensor(y_first_el) if y is not None else None
self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(
[1] + list(x_first_el.get_shape()),
[1] + list(y_first_el.get_shape()) if y is not None else None,
n_classes,
batch_size)
self._input_dtype = _check_dtype(x_first_el.dtype).as_numpy_dtype
x_is_dict, y_is_dict = isinstance(x_first_el, dict), y is not None and isinstance(y_first_el, dict)
if y_is_dict and n_classes is not None:
assert (isinstance(n_classes, dict))
# extract shapes for first_elements
x_first_el_shape = dict([(k, [1] + list(v.shape)) for k, v in list(x_first_el.items())]) if x_is_dict \
else [1] + list(x_first_el.shape)
y_first_el_shape = dict([(k, [1] + list(v.shape)) for k, v in list(y_first_el.items())]) if y_is_dict \
else ([1] + list(y_first_el[0].shape if isinstance(y_first_el, list) else y_first_el.shape)
if y is not None else None)
self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(x_first_el_shape, y_first_el_shape,
n_classes, batch_size)
# Input dtype of x_first_el.
self._input_dtype = dict([(k, _check_dtype(v.dtype)) for k, v in list(x_first_el.items())]) if x_is_dict \
else _check_dtype(x_first_el.dtype)
# Output dtype of y_first_el.
def check_y_dtype(el):
if isinstance(el, list) or isinstance(el, np.ndarray):
if isinstance(el, np.ndarray) and el.ndim == 0:
return el.dtype
else:
return _check_dtype(np.dtype(type(el[0])))
else:
return _check_dtype(np.dtype(type(el)))
# Output types are floats, due to both softmaxes and regression req.
if n_classes is not None and n_classes > 0:
if n_classes is not None and (y is None or not y_is_dict) and n_classes > 0:
self._output_dtype = np.float32
elif y is not None:
self._output_dtype = _check_dtype(y_first_el.dtype).as_numpy_dtype
else:
self._output_dtype = dict([(k, check_y_dtype(v)) for k, v in list(y_first_el.items())]) if y_is_dict \
else (check_y_dtype(y_first_el) if y is not None else None)
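A minimal construction sketch for the dict-aware `StreamingDataFeeder`, mirroring the first-element inspection above (assumed import path; the generators and shapes are illustrative):

import numpy as np
from tensorflow.contrib.learn.python.learn.learn_io.data_feeder import (
    StreamingDataFeeder)

def x_gen():
    # Each element is one feature sample, here a dict with a single key.
    while True:
        yield {'a': np.zeros(4, dtype=np.float32)}

def y_gen():
    # Each element is one label sample; n_classes below is a dict with
    # the same key, so the label gets one-hot converted to 3 classes.
    while True:
        yield {'labels': np.int64(1)}

feeder = StreamingDataFeeder(x_gen(), y_gen(),
                             n_classes={'labels': 3}, batch_size=8)
print(feeder.input_shape)   # {'a': [8, 4]}
print(feeder.output_shape)  # {'labels': [8, 3]}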
def get_feed_params(self):
"""Function returns a dict with data feed params while training.
"""Function returns a `dict` with data feed params while training.
Returns:
A dict with data feed params while training.
A `dict` with data feed params while training.
"""
return {'batch_size': self._batch_size}
@ -494,50 +623,76 @@ class StreamingDataFeeder(DataFeeder):
"""Samples data and provides it to placeholders.
Returns:
Dict of input and output tensors.
`dict` of input and output tensors.
"""
def init_array(shape, dtype):
if shape is None:
return None
else:
return dict([(k, np.zeros(shape[k], dtype[k])) for k in list(shape.keys())]) if isinstance(shape, dict) else \
np.zeros(shape, dtype=dtype)
def put_data_array(dest, index, source=None, n_classes=None):
if source is None:
dest = dest[:index, :]
elif n_classes is not None and n_classes > 1:
if len(self.output_shape) == 2:
dest.itemset((index, source), 1.0)
else:
for idx, value in enumerate(source):
dest.itemset(tuple([index, idx, value]), 1.0)
else:
if len(dest.shape) > 1:
dest[index, :] = source
else:
dest[index] = source[0] if isinstance(source, list) else source
return dest
def put_data_array_or_dict(holder, index, data=None, n_classes=None):
if holder is None:
return None
if isinstance(holder, dict):
assert (isinstance(data, dict))
for k, v in list(holder.items()):
num_classes = n_classes[k] if (n_classes is not None and k in n_classes) else None
holder[k] = put_data_array(holder[k], index, data[k], num_classes)
else:
holder = put_data_array(holder, index, data, n_classes)
return holder
if self.stopped:
raise StopIteration
try:
inp = np.zeros(self.input_shape, dtype=self._input_dtype)
except TypeError as exc:
raise TypeError('Unrecognized dtype: {}. {}'.format(
self._input_dtype, exc))
if self._y is not None:
out = np.zeros(self.output_shape, dtype=self._output_dtype)
inp = init_array(self.input_shape, self._input_dtype)
out = init_array(self.output_shape, self._output_dtype)
for i in xrange(self._batch_size):
# Add handling when queue ends.
try:
inp[i, :] = six.next(self._x)
next_inp = six.next(self._x)
inp = put_data_array_or_dict(inp, i, next_inp, None)
except StopIteration:
self.stopped = True
if i == 0:
raise
inp = inp[:i, :]
if self._y is not None:
out = out[:i]
inp = put_data_array_or_dict(inp, i, None, None)
out = put_data_array_or_dict(out, i, None, None)
break
if self._y is not None:
y = six.next(self._y)
if self.n_classes is not None and self.n_classes > 1:
if len(self.output_shape) == 2:
out.itemset((i, y), 1.0)
else:
for idx, value in enumerate(y):
out.itemset(tuple([i, idx, value]), 1.0)
else:
# The y iterator can sometimes return scalars or singleton lists.
try:
out[i] = y
except ValueError as _:
assert len(y) == 1, ('Expected singleton label, got {}'
.format(repr(y)))
out[i] = y[0]
if self._y is None:
return {self._input_placeholder.name: inp}
return {self._input_placeholder.name: inp,
self._output_placeholder.name: out}
next_out = six.next(self._y)
out = put_data_array_or_dict(out, i, next_out, self.n_classes)
# creating feed_dict
feed_dict = dict([(self._input_placeholder[k].name, inp[k]) for k in list(self._input_placeholder.keys())]) if \
isinstance(inp, dict) else {self._input_placeholder.name: inp}
if self._y is not None:
feed_dict.update(
dict([(self._output_placeholder[k].name, out[k]) for k in list(self._output_placeholder.keys())]) \
if isinstance(out, dict) else {self._output_placeholder.name: out})
return feed_dict
return _feed_dict_fn
@ -575,6 +730,10 @@ class DaskDataFeeder(object):
input_dtype: dtype of input.
output_dtype: dtype of output.
"""
if isinstance(x, dict) or isinstance(y, dict):
raise ValueError("DaskDataFeeder does not support dictionaries at the moment.")
# pylint: disable=invalid-name,super-init-not-called
import dask.dataframe as dd # pylint: disable=g-import-not-at-top
# TODO(terrytangyuan): check x and y dtypes in dask_io like pandas
@ -601,7 +760,7 @@ class DaskDataFeeder(object):
self._shuffle = shuffle
self.epochs = epochs
self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(
x_shape, y_shape, n_classes, batch_size)
x_shape, y_shape, n_classes, batch_size)
self.sample_fraction = self._batch_size / float(x_count)
self._input_dtype = _check_dtype(self._x.dtypes[0])
self._output_dtype = _check_dtype(self._y.dtypes[self._y_columns])
@ -611,10 +770,10 @@ class DaskDataFeeder(object):
self.random_state = random_state
def get_feed_params(self):
"""Function returns a dict with data feed params while training.
"""Function returns a `dict` with data feed params while training.
Returns:
A dict with data feed params while training.
A `dict` with data feed params while training.
"""
return {'batch_size': self._batch_size}
@ -629,13 +788,14 @@ class DaskDataFeeder(object):
A function that when called samples a random subset of batch size
from x and y.
"""
def _feed_dict_fn():
"""Samples data and provides it to placeholders."""
# TODO(ipolosukhin): option for with/without replacement (dev version of
# dask)
sample = self.df.random_split(
[self.sample_fraction, 1 - self.sample_fraction],
random_state=self.random_state)
[self.sample_fraction, 1 - self.sample_fraction],
random_state=self.random_state)
inp = extract_pandas_matrix(sample[0][self._x_columns].compute()).tolist()
out = extract_pandas_matrix(sample[0][self._y_columns].compute())
# convert to correct dtype
@ -650,4 +810,5 @@ class DaskDataFeeder(object):
encoded_out[np.arange(out.size), out] = 1
return {input_placeholder.name: inp,
output_placeholder.name: encoded_out}
return _feed_dict_fn

View File

@ -32,150 +32,200 @@ class DataFeederTest(tf.test.TestCase):
# pylint: disable=undefined-variable
"""Tests for `DataFeeder`."""
def _wrap_dict(self, data, prepend=''):
return {prepend+'1': data, prepend+'2': data}
def _assert_raises(self, input_data):
with self.assertRaisesRegexp(TypeError, 'annot convert'):
data_feeder.DataFeeder(input_data, None, n_classes=0, batch_size=1)
def test_input_uint32(self):
self._assert_raises(np.matrix([[1, 2], [3, 4]], dtype=np.uint32))
data = np.matrix([[1, 2], [3, 4]], dtype=np.uint32)
self._assert_raises(data)
self._assert_raises(self._wrap_dict(data))
def test_input_uint64(self):
self._assert_raises(np.matrix([[1, 2], [3, 4]], dtype=np.uint64))
data = np.matrix([[1, 2], [3, 4]], dtype=np.uint64)
self._assert_raises(data)
self._assert_raises(self._wrap_dict(data))
def _assert_dtype(self, expected_np_dtype, expected_tf_dtype, input_data):
feeder = data_feeder.DataFeeder(input_data, None, n_classes=0, batch_size=1)
self.assertEqual(expected_np_dtype, feeder.input_dtype)
if isinstance(input_data, dict):
for k, v in list(feeder.input_dtype.items()):
self.assertEqual(expected_np_dtype, v)
else:
self.assertEqual(expected_np_dtype, feeder.input_dtype)
with tf.Graph().as_default() as g, self.test_session(g):
inp, _ = feeder.input_builder()
self.assertEqual(expected_tf_dtype, inp.dtype)
if isinstance(inp, dict):
for k, v in list(inp.items()):
self.assertEqual(expected_tf_dtype, v.dtype)
else:
self.assertEqual(expected_tf_dtype, inp.dtype)
def test_input_int8(self):
self._assert_dtype(
np.int8, tf.int8, np.matrix([[1, 2], [3, 4]], dtype=np.int8))
data = np.matrix([[1, 2], [3, 4]], dtype=np.int8)
self._assert_dtype(np.int8, tf.int8, data)
self._assert_dtype(np.int8, tf.int8, self._wrap_dict(data))
def test_input_int16(self):
self._assert_dtype(
np.int16, tf.int16, np.matrix([[1, 2], [3, 4]], dtype=np.int16))
data = np.matrix([[1, 2], [3, 4]], dtype=np.int16)
self._assert_dtype(np.int16, tf.int16, data)
self._assert_dtype(np.int16, tf.int16, self._wrap_dict(data))
def test_input_int32(self):
self._assert_dtype(
np.int32, tf.int32, np.matrix([[1, 2], [3, 4]], dtype=np.int32))
data = np.matrix([[1, 2], [3, 4]], dtype=np.int32)
self._assert_dtype(np.int32, tf.int32, data)
self._assert_dtype(np.int32, tf.int32, self._wrap_dict(data))
def test_input_int64(self):
self._assert_dtype(
np.int64, tf.int64, np.matrix([[1, 2], [3, 4]], dtype=np.int64))
data = np.matrix([[1, 2], [3, 4]], dtype=np.int64)
self._assert_dtype(np.int64, tf.int64, data)
self._assert_dtype(np.int64, tf.int64, self._wrap_dict(data))
def test_input_uint8(self):
self._assert_dtype(
np.uint8, tf.uint8, np.matrix([[1, 2], [3, 4]], dtype=np.uint8))
data = np.matrix([[1, 2], [3, 4]], dtype=np.uint8)
self._assert_dtype(np.uint8, tf.uint8, data)
self._assert_dtype(np.uint8, tf.uint8, self._wrap_dict(data))
def test_input_uint16(self):
self._assert_dtype(
np.uint16, tf.uint16, np.matrix([[1, 2], [3, 4]], dtype=np.uint16))
data = np.matrix([[1, 2], [3, 4]], dtype=np.uint16)
self._assert_dtype(np.uint16, tf.uint16, data)
self._assert_dtype(np.uint16, tf.uint16, self._wrap_dict(data))
def test_input_float16(self):
self._assert_dtype(
np.float16, tf.float16, np.matrix([[1, 2], [3, 4]], dtype=np.float16))
data = np.matrix([[1, 2], [3, 4]], dtype=np.float16)
self._assert_dtype(np.float16, tf.float16, data)
self._assert_dtype(np.float16, tf.float16, self._wrap_dict(data))
def test_input_float32(self):
self._assert_dtype(
np.float32, tf.float32, np.matrix([[1, 2], [3, 4]], dtype=np.float32))
data = np.matrix([[1, 2], [3, 4]], dtype=np.float32)
self._assert_dtype(np.float32, tf.float32, data)
self._assert_dtype(np.float32, tf.float32, self._wrap_dict(data))
def test_input_float64(self):
self._assert_dtype(
np.float64, tf.float64, np.matrix([[1, 2], [3, 4]], dtype=np.float64))
data = np.matrix([[1, 2], [3, 4]], dtype=np.float64)
self._assert_dtype(np.float64, tf.float64, data)
self._assert_dtype(np.float64, tf.float64, self._wrap_dict(data))
def test_input_bool(self):
self._assert_dtype(
np.bool, tf.bool,
np.array([[False for _ in xrange(2)] for _ in xrange(2)]))
data = np.array([[False for _ in xrange(2)] for _ in xrange(2)])
self._assert_dtype(np.bool, tf.bool, data)
self._assert_dtype(np.bool, tf.bool, self._wrap_dict(data))
def test_input_string(self):
input_data = np.array([['str%d' % i for i in xrange(2)] for _ in xrange(2)])
self._assert_dtype(input_data.dtype, tf.string, input_data)
self._assert_dtype(input_data.dtype, tf.string, self._wrap_dict(input_data))
def _assertAllClose(self, src, dest, src_key_of=None, src_prop=None):
def func(x):
val = getattr(x, src_prop) if src_prop else x
return val if src_key_of is None else src_key_of[val]
if isinstance(src, dict):
for k in list(src.keys()):
self.assertAllClose(func(src[k]), dest)
else:
self.assertAllClose(func(src), dest)
def test_unsupervised(self):
def func(feeder):
with self.test_session():
inp, _ = feeder.input_builder()
feed_dict_fn = feeder.get_feed_dict_fn()
feed_dict = feed_dict_fn()
self._assertAllClose(inp, [[1, 2]], feed_dict, 'name')
data = np.matrix([[1, 2], [2, 3], [3, 4]])
feeder = data_feeder.DataFeeder(data, None, n_classes=0, batch_size=1)
with self.test_session():
inp, _ = feeder.input_builder()
feed_dict_fn = feeder.get_feed_dict_fn()
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[inp.name], [[1, 2]])
func(data_feeder.DataFeeder(data, None, n_classes=0, batch_size=1))
func(data_feeder.DataFeeder(self._wrap_dict(data), None, n_classes=0, batch_size=1))
def test_data_feeder_regression(self):
def func(df):
inp, out = df.input_builder()
feed_dict_fn = df.get_feed_dict_fn()
feed_dict = feed_dict_fn()
self._assertAllClose(inp, [[3, 4], [1, 2]], feed_dict, 'name')
self._assertAllClose(out, [2, 1], feed_dict, 'name')
x = np.matrix([[1, 2], [3, 4]])
y = np.array([1, 2])
df = data_feeder.DataFeeder(x, y, n_classes=0, batch_size=3)
inp, out = df.input_builder()
feed_dict_fn = df.get_feed_dict_fn()
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[inp.name], [[3, 4], [1, 2]])
self.assertAllClose(feed_dict[out.name], [2, 1])
func(data_feeder.DataFeeder(x, y, n_classes=0, batch_size=3))
func(data_feeder.DataFeeder(self._wrap_dict(x, 'in'), self._wrap_dict(y, 'out'),
n_classes=self._wrap_dict(0, 'out'), batch_size=3))
def test_epoch(self):
def func(feeder):
with self.test_session():
feeder.input_builder()
epoch = feeder.make_epoch_variable()
feed_dict_fn = feeder.get_feed_dict_fn()
# First input
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[epoch.name], [0])
# Second input
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[epoch.name], [0])
# Third input
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[epoch.name], [0])
# Back to the first input again, so new epoch.
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[epoch.name], [1])
data = np.matrix([[1, 2], [2, 3], [3, 4]])
labels = np.array([0, 0, 1])
feeder = data_feeder.DataFeeder(data, labels, n_classes=0, batch_size=1)
with self.test_session():
feeder.input_builder()
epoch = feeder.make_epoch_variable()
feed_dict_fn = feeder.get_feed_dict_fn()
# First input
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[epoch.name], [0])
# Second input
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[epoch.name], [0])
# Third input
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[epoch.name], [0])
# Back to the first input again, so new epoch.
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[epoch.name], [1])
func(data_feeder.DataFeeder(data, labels, n_classes=0, batch_size=1))
func(data_feeder.DataFeeder(self._wrap_dict(data, 'in'), self._wrap_dict(labels, 'out'),
n_classes=self._wrap_dict(0, 'out'), batch_size=1))
def test_data_feeder_multioutput_regression(self):
def func(df):
inp, out = df.input_builder()
feed_dict_fn = df.get_feed_dict_fn()
feed_dict = feed_dict_fn()
self._assertAllClose(inp, [[3, 4], [1, 2]], feed_dict, 'name')
self._assertAllClose(out, [[3, 4], [1, 2]], feed_dict, 'name')
x = np.matrix([[1, 2], [3, 4]])
y = np.array([[1, 2], [3, 4]])
df = data_feeder.DataFeeder(x, y, n_classes=0, batch_size=2)
inp, out = df.input_builder()
feed_dict_fn = df.get_feed_dict_fn()
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[inp.name], [[3, 4], [1, 2]])
self.assertAllClose(feed_dict[out.name], [[3, 4], [1, 2]])
func(data_feeder.DataFeeder(x, y, n_classes=0, batch_size=2))
func(data_feeder.DataFeeder(self._wrap_dict(x, 'in'), self._wrap_dict(y, 'out'),
n_classes=self._wrap_dict(0, 'out'), batch_size=2))
def test_data_feeder_multioutput_classification(self):
def func(df):
inp, out = df.input_builder()
feed_dict_fn = df.get_feed_dict_fn()
feed_dict = feed_dict_fn()
self._assertAllClose(inp, [[3, 4], [1, 2]], feed_dict, 'name')
self._assertAllClose(out,
[[[0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]],
[[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0, 0]]],
feed_dict, 'name')
x = np.matrix([[1, 2], [3, 4]])
y = np.array([[0, 1, 2], [2, 3, 4]])
df = data_feeder.DataFeeder(x, y, n_classes=5, batch_size=2)
inp, out = df.input_builder()
feed_dict_fn = df.get_feed_dict_fn()
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[inp.name], [[3, 4], [1, 2]])
self.assertAllClose(feed_dict[out.name],
[[[0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]],
[[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0, 0]]])
func(data_feeder.DataFeeder(x, y, n_classes=5, batch_size=2))
func(data_feeder.DataFeeder(self._wrap_dict(x, 'in'), self._wrap_dict(y, 'out'),
n_classes=self._wrap_dict(5, 'out'), batch_size=2))
def test_streaming_data_feeder(self):
def func(df):
inp, out = df.input_builder()
feed_dict_fn = df.get_feed_dict_fn()
feed_dict = feed_dict_fn()
self._assertAllClose(inp, [[1, 2], [3, 4]], feed_dict, 'name')
self._assertAllClose(out, [1, 2], feed_dict, 'name')
def x_iter():
yield np.array([1, 2])
yield np.array([3, 4])
def x_iter(wrap_dict=False):
yield np.array([1, 2]) if not wrap_dict else self._wrap_dict(np.array([1, 2]), 'in')
yield np.array([3, 4]) if not wrap_dict else self._wrap_dict(np.array([3, 4]), 'in')
def y_iter():
yield np.array([1])
yield np.array([2])
def y_iter(wrap_dict=False):
yield np.array([1]) if not wrap_dict else self._wrap_dict(np.array([1]), 'out')
yield np.array([2]) if not wrap_dict else self._wrap_dict(np.array([2]), 'out')
df = data_feeder.StreamingDataFeeder(x_iter(),
y_iter(),
n_classes=0,
batch_size=2)
inp, out = df.input_builder()
feed_dict_fn = df.get_feed_dict_fn()
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[inp.name], [[1, 2], [3, 4]])
self.assertAllClose(feed_dict[out.name], [1, 2])
func(data_feeder.StreamingDataFeeder(x_iter(), y_iter(), n_classes=0, batch_size=2))
func(data_feeder.StreamingDataFeeder(x_iter(True), y_iter(True),
n_classes=self._wrap_dict(0, 'out'), batch_size=2))
def test_dask_data_feeder(self):
if HAS_PANDAS and HAS_DASK:
@ -196,6 +246,13 @@ class DataFeederTest(tf.test.TestCase):
self.assertAllClose(feed_dict[out.name], [[0., 0., 1.], [0., 1., 0.]])
def test_hdf5_data_feeder(self):
def func(df):
inp, out = df.input_builder()
feed_dict_fn = df.get_feed_dict_fn()
feed_dict = feed_dict_fn()
self._assertAllClose(inp, [[3, 4], [1, 2]], feed_dict, 'name')
self._assertAllClose(out, [2, 1], feed_dict, 'name')
try:
import h5py # pylint: disable=g-import-not-at-top
x = np.matrix([[1, 2], [3, 4]])
@ -207,25 +264,28 @@ class DataFeederTest(tf.test.TestCase):
h5f = h5py.File('test_hdf5.h5', 'r')
x = h5f['x']
y = h5f['y']
df = data_feeder.DataFeeder(x, y, n_classes=0, batch_size=3)
inp, out = df.input_builder()
feed_dict_fn = df.get_feed_dict_fn()
feed_dict = feed_dict_fn()
self.assertAllClose(feed_dict[inp.name], [[3, 4], [1, 2]])
self.assertAllClose(feed_dict[out.name], [2, 1])
func(data_feeder.DataFeeder(x, y, n_classes=0, batch_size=3))
func(data_feeder.DataFeeder(self._wrap_dict(x, 'in'), self._wrap_dict(y, 'out'),
n_classes=self._wrap_dict(0, 'out'), batch_size=3))
except ImportError:
print("Skipped test for hdf5 since it's not installed.")
class SetupPredictDataFeederTest(tf.test.TestCase):
class SetupPredictDataFeederTest(DataFeederTest):
"""Tests for `DataFeeder.setup_predict_data_feeder`."""
def test_iterable_data(self):
# pylint: disable=undefined-variable
x = iter([[1, 2], [3, 4], [5, 6]])
df = data_feeder.setup_predict_data_feeder(x, batch_size=2)
self.assertAllClose(six.next(df), [[1, 2], [3, 4]])
self.assertAllClose(six.next(df), [[5, 6]])
def func(df):
self._assertAllClose(six.next(df), [[1, 2], [3, 4]])
self._assertAllClose(six.next(df), [[5, 6]])
data = [[1, 2], [3, 4], [5, 6]]
x = iter(data)
x_dict = iter([self._wrap_dict(v) for v in iter(data)])
func(data_feeder.setup_predict_data_feeder(x, batch_size=2))
func(data_feeder.setup_predict_data_feeder(x_dict, batch_size=2))
if __name__ == '__main__':

View File

@ -33,17 +33,17 @@ class Trainable(object):
"""Trains a model given training data `x` predictions and `y` labels.
Args:
x: Matrix of shape [n_samples, n_features...]. Can be iterator that
returns arrays of features. The training input samples for fitting the
model. If set, `input_fn` must be `None`.
y: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be
iterator that returns array of labels. The training label values
(class labels in classification, real numbers in regression). If set,
`input_fn` must be `None`. Note: For classification, label values must
x: Matrix of shape [n_samples, n_features...] or a dictionary of matrices.
Can be an iterator that returns arrays of features or a dictionary of arrays of features.
The training input samples for fitting the model. If set, `input_fn` must be `None`.
y: Vector or matrix [n_samples] or [n_samples, n_outputs] or a dictionary of the same.
Can be an iterator that returns arrays of labels or a dictionary of arrays of labels.
The training label values (class labels in classification, real numbers in regression).
If set, `input_fn` must be `None`. Note: For classification, label values must
be integers representing the class index (i.e. values from 0 to
n_classes-1).
input_fn: Input function returning a tuple of:
features - Dictionary of string feature name to `Tensor` or `Tensor`.
features - `Tensor` or dictionary of string feature name to `Tensor`.
labels - `Tensor` or dictionary of `Tensor` with labels.
If input_fn is set, `x`, `y`, and `batch_size` must be `None`.
steps: Number of steps for which to train model. If `None`, train forever.
@ -67,4 +67,3 @@ class Trainable(object):
`self`, for chaining.
"""
raise NotImplementedError
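The two feeding styles described in the docstring are mutually exclusive; below is a minimal sketch of both, assuming `est` is some estimator implementing `Trainable` (a hypothetical name) and the data are made up:

```python
import numpy as np
import tensorflow as tf

x = np.random.rand(100, 4).astype(np.float32)   # [n_samples, n_features]
y = np.random.randint(0, 3, size=100)           # class indices in [0, n_classes)

# Style 1: pass arrays (or dicts of arrays) directly; input_fn must be None.
# est.fit(x=x, y=y, batch_size=32, steps=200)

# Style 2: pass an input_fn returning (features, labels); x, y and batch_size
# must then be None.
def input_fn():
  features = tf.constant(x)   # `Tensor` or dict of string feature name -> `Tensor`
  labels = tf.constant(y)     # `Tensor` or dict of `Tensor`
  return features, labels

# est.fit(input_fn=input_fn, steps=200)
```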

View File

@ -19,5 +19,4 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.contrib.learn.python.learn.utils import checkpoints
from tensorflow.contrib.learn.python.learn.utils.export import export_estimator

View File

@ -142,10 +142,11 @@ xcode-select --install
If this is a new install, you will need to run XCode once to agree to the
license before continuing.
Then install [automake](https://en.wikipedia.org/wiki/Automake):
Then install [automake](https://en.wikipedia.org/wiki/Automake)/[libtool](https://en.wikipedia.org/wiki/GNU_Libtool):
```bash
brew install automake
brew install libtool
```
Also, download the graph if you haven't already:

View File

@ -67,7 +67,6 @@ fi
make distclean
./configure \
--build=x86_64-apple-${OSX_VERSION} \
--host=i386-apple-${OSX_VERSION} \
--disable-shared \
--enable-cross-compile \
@ -95,7 +94,6 @@ make install
make distclean
./configure \
--build=x86_64-apple-${OSX_VERSION} \
--host=x86_64-apple-${OSX_VERSION} \
--disable-shared \
--enable-cross-compile \
@ -123,7 +121,6 @@ make install
make distclean
./configure \
--build=x86_64-apple-${OSX_VERSION} \
--host=armv7-apple-${OSX_VERSION} \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
@ -147,7 +144,6 @@ make install
make distclean
./configure \
--build=x86_64-apple-${OSX_VERSION} \
--host=armv7s-apple-${OSX_VERSION} \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \
@ -171,7 +167,6 @@ make install
make distclean
./configure \
--build=x86_64-apple-${OSX_VERSION} \
--host=arm \
--with-protoc="${PROTOC_PATH}" \
--disable-shared \

View File

@ -19,7 +19,6 @@ from __future__ import print_function
from tensorflow.python.ops import sets
set_size = sets.set_size
set_intersection = sets.set_intersection

View File

@ -158,7 +158,7 @@ class GRUBlockCellTest(tf.test.TestCase):
output = gru_ops.GRUBlockCell(cell_size)(x, h)
sess.run([tf.global_variables_initializer()])
all_variables = tf.all_variables()[0:4]
all_variables = tf.global_variables()[0:4]
[w_ru, b_ru, w_c, b_c] = all_variables
d_new_h_wrt_x = tf.gradients([output], x)
@ -178,7 +178,7 @@ class GRUBlockCellTest(tf.test.TestCase):
output = tf.contrib.rnn.GRUCell(cell_size)(x, h)
sess.run([tf.global_variables_initializer()])
all_variables = tf.all_variables()[4:8]
all_variables = tf.global_variables()[4:8]
[w_ru, b_ru, w_c, b_c] = all_variables
d_new_h_wrt_x = tf.gradients([output], x)
@ -281,7 +281,7 @@ class GRUBlockCellTest(tf.test.TestCase):
sess.run([tf.global_variables_initializer()])
all_variables = tf.all_variables()
all_variables = tf.global_variables()
[w_ru, b_ru, w_c, b_c] = all_variables[:4]

View File

@ -382,7 +382,7 @@ class StackBidirectionalRNNTest(tf.test.TestCase):
# check that all the variables names starts with the proper scope.
tf.global_variables_initializer()
all_vars = tf.all_variables()
all_vars = tf.global_variables()
prefix = prefix or "stack_bidirectional_rnn"
scope_vars = [v for v in all_vars if v.name.startswith(prefix + "/")]
tf.logging.info("StackRNN with scope: %s (%s)"

View File

@ -16,9 +16,9 @@ py_library(
)
cuda_py_test(
name = "layers_test",
name = "decoder_fn_test",
size = "medium",
srcs = ["python/kernel_tests/layers_test.py"],
srcs = ["python/kernel_tests/decoder_fn_test.py"],
additional_deps = [
":seq2seq_py",
"//tensorflow:tensorflow_py",
@ -28,9 +28,9 @@ cuda_py_test(
)
cuda_py_test(
name = "loss_test",
name = "seq2seq_test",
size = "medium",
srcs = ["python/kernel_tests/loss_test.py"],
srcs = ["python/kernel_tests/seq2seq_test.py"],
additional_deps = [
":seq2seq_py",
"//tensorflow:tensorflow_py",

View File

@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Ops for building neural network seq2seq layers and losses."""
"""Ops for building neural network seq2seq decoders and losses."""
from __future__ import absolute_import
from __future__ import division
@ -22,6 +22,7 @@ from __future__ import print_function
import sys
# pylint: disable=unused-import,line-too-long
from tensorflow.contrib.seq2seq.python.ops import layers
from tensorflow.contrib.seq2seq.python.ops import loss
from tensorflow.contrib.seq2seq.python.ops.decoder_fn import *
from tensorflow.contrib.seq2seq.python.ops.loss import *
from tensorflow.contrib.seq2seq.python.ops.seq2seq import *
# pylint: enable=unused-import,line-too-long

View File

@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Tests for contrib.seq2seq.python.seq2seq.layers_ops."""
"""Tests for contrib.seq2seq.python.seq2seq.loss_ops."""
# pylint: disable=unused-import,g-bad-import-order
from __future__ import absolute_import
from __future__ import division
@ -23,12 +23,9 @@ from __future__ import print_function
import tensorflow as tf
class LayersTest(tf.test.TestCase):
class DecoderFnTest(tf.test.TestCase):
def testRNNDecoder(self):
pass
def testRNNDecoderAttention(self):
def testDecoderFn(self):
pass

View File

@ -0,0 +1,129 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for contrib.seq2seq.python.ops.seq2seq."""
# pylint: disable=unused-import,g-bad-import-order
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# pylint: enable=unused-import
import tensorflow as tf
from tensorflow.contrib import layers
class Seq2SeqTest(tf.test.TestCase):
# test a default call of rnn_decoder
def test_rnn_decoder(self):
pass
# test default call with time_major=True
def test_dynamic_rnn_decoder_time_major(self):
with self.test_session() as sess:
with tf.variable_scope("root", initializer=
tf.constant_initializer(0.5)) as varscope:
# Define inputs/outputs to model
batch_size = 2
encoder_embedding_size = 3
decoder_embedding_size = 4
encoder_hidden_size = 5
decoder_hidden_size = encoder_hidden_size
input_sequence_length = 6
decoder_sequence_length = 7
num_decoder_symbols = 20
start_of_sequence_id = end_of_sequence_id = 1
decoder_embeddings = tf.get_variable('decoder_embeddings',
[num_decoder_symbols, decoder_embedding_size],
initializer=tf.random_normal_initializer(stddev=0.1))
inputs = tf.constant(0.5, shape=[input_sequence_length, batch_size,
encoder_embedding_size])
decoder_inputs = tf.constant(0.4, shape=[decoder_sequence_length,
batch_size,
decoder_embedding_size])
decoder_length = tf.constant(decoder_sequence_length, dtype=tf.int32,
shape=[batch_size,])
with tf.variable_scope("rnn") as scope:
# setting up weights for computing the final output
output_fn = lambda x: layers.linear(x, num_decoder_symbols,
scope=scope)
# Define model
encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
cell=tf.nn.rnn_cell.GRUCell(encoder_hidden_size), inputs=inputs,
dtype=tf.float32, time_major=True, scope=scope)
with tf.variable_scope("decoder") as scope:
# Train decoder
decoder_cell = tf.nn.rnn_cell.GRUCell(decoder_hidden_size)
decoder_fn_train = tf.contrib.seq2seq.simple_decoder_fn_train(
encoder_state=encoder_state)
decoder_outputs_train, decoder_state_train = (
tf.contrib.seq2seq.dynamic_rnn_decoder(
cell=decoder_cell,
decoder_fn=decoder_fn_train,
inputs=decoder_inputs,
sequence_length=decoder_length,
time_major=True,
scope=scope))
decoder_outputs_train = output_fn(decoder_outputs_train)
# Setup variable reuse
scope.reuse_variables()
# Inference decoder
decoder_fn_inference = (
tf.contrib.seq2seq.simple_decoder_fn_inference(
output_fn=output_fn,
encoder_state=encoder_state,
embeddings=decoder_embeddings,
start_of_sequence_id=start_of_sequence_id,
end_of_sequence_id=end_of_sequence_id,
#TODO: find out why it goes to +1
maximum_length=decoder_sequence_length-1,
num_decoder_symbols=num_decoder_symbols,
dtype=tf.int32))
decoder_outputs_inference, decoder_state_inference = (
tf.contrib.seq2seq.dynamic_rnn_decoder(
cell=decoder_cell,
decoder_fn=decoder_fn_inference,
time_major=True,
scope=scope))
# Run model
tf.global_variables_initializer().run()
decoder_outputs_train_res, decoder_state_train_res = sess.run(
[decoder_outputs_train, decoder_state_train])
decoder_outputs_inference_res, decoder_state_inference_res = sess.run(
[decoder_outputs_inference, decoder_state_inference])
# Assert outputs
self.assertEqual((decoder_sequence_length, batch_size,
num_decoder_symbols),
decoder_outputs_train_res.shape)
self.assertEqual((batch_size, num_decoder_symbols),
decoder_outputs_inference_res.shape[1:3])
self.assertEqual((batch_size, decoder_hidden_size),
decoder_state_train_res.shape)
self.assertEqual((batch_size, decoder_hidden_size),
decoder_state_inference_res.shape)
# The dynamic decoder might end earlier than `maximum_length`
# under inference
true_value = (decoder_sequence_length>=
decoder_state_inference_res.shape[0])
self.assertEqual((true_value), True)
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,249 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Seq2seq loss operations for use in neural networks.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import rnn_cell
from tensorflow.python.util import nest
__all__ = ["simple_decoder_fn_train",
"simple_decoder_fn_inference"]
def simple_decoder_fn_train(encoder_state, name=None):
""" Simple decoder function for a sequence-to-sequence model used in the
`dynamic_rnn_decoder`.
The `simple_decoder_fn_train` is a simple training function for a
sequence-to-sequence model. It should be used when `dynamic_rnn_decoder` is
in the training mode.
The `simple_decoder_fn_train` is called with a set of the user arguments and
returns the `decoder_fn`, which can be passed to the `dynamic_rnn_decoder`,
such that
```
dynamic_fn_train = simple_decoder_fn_train(encoder_state)
outputs_train, state_train = dynamic_rnn_decoder(
decoder_fn=dynamic_fn_train, ...)
```
Further usage can be found in the `kernel_tests/seq2seq_test.py`.
Args:
encoder_state: The encoded state to initialize the `dynamic_rnn_decoder`.
name: (default: `None`) NameScope for the decoder function;
defaults to "simple_decoder_fn_train"
Returns:
A decoder function with the required interface of `dynamic_rnn_decoder`
intended for training.
"""
with ops.name_scope(name, "simple_decoder_fn_train", [encoder_state]):
pass
def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
""" Decoder function used in the `dynamic_rnn_decoder` with the purpose of
training.
Args:
time: positive integer constant reflecting the current timestep.
cell_state: state of RNNCell.
cell_input: input provided by `dynamic_rnn_decoder`.
cell_output: output of RNNCell.
context_state: context state provided by `dynamic_rnn_decoder`.
Returns:
A tuple (done, next state, next input, emit output, next context state)
where:
done: `None`, which is used by the `dynamic_rnn_decoder` to indicate
that `sequence_lengths` in `dynamic_rnn_decoder` should be used.
next state: `cell_state`, this decoder function does not modify the
given state.
next input: `cell_input`, this decoder function does not modify the
given input. The input could be modified when applying e.g. attention.
emit output: `cell_output`, this decoder function does not modify the
given output.
next context state: `context_state`, this decoder function does not
modify the given context state. The context state could be modified when
applying e.g. beam search.
"""
with ops.name_scope(name, "simple_decoder_fn_train",
[time, cell_state, cell_input, cell_output,
context_state]):
if cell_state is None: # first call, return encoder_state
return (None, encoder_state, cell_input, cell_output, context_state)
else:
return (None, cell_state, cell_input, cell_output, context_state)
return decoder_fn
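A minimal sketch of the contract of the returned `decoder_fn`, using the symbol as exported through `tf.contrib.seq2seq` in this commit (toy shapes):

```python
import tensorflow as tf

batch_size, state_size = 2, 5
encoder_state = tf.zeros([batch_size, state_size])
decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_train(encoder_state)

# First call (cell_state is None): the encoder state is handed back as the
# next cell state; `done` is None, so `dynamic_rnn_decoder` falls back to
# `sequence_length` for early stopping.
done, next_state, next_input, emit, context = decoder_fn(
    time=0, cell_state=None, cell_input=None, cell_output=None,
    context_state=None)
assert done is None and next_state is encoder_state
```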
def simple_decoder_fn_inference(output_fn, encoder_state, embeddings,
start_of_sequence_id, end_of_sequence_id,
maximum_length, num_decoder_symbols,
dtype=dtypes.int32, name=None):
""" Simple decoder function for a sequence-to-sequence model used in the
`dynamic_rnn_decoder`.
The `simple_decoder_fn_inference` is a simple inference function for a
sequence-to-sequence model. It should be used when `dynamic_rnn_decoder` is
in the inference mode.
The `simple_decoder_fn_inference` is called with a set of the user arguments
and returns the `decoder_fn`, which can be passed to the
`dynamic_rnn_decoder`, such that
```
dynamic_fn_inference = simple_decoder_fn_inference(...)
outputs_inference, state_inference = dynamic_rnn_decoder(
decoder_fn=dynamic_fn_inference, ...)
```
Further usage can be found in the `kernel_tests/seq2seq_test.py`.
Args:
output_fn: An output function to project your `cell_output` onto class
logits.
An example of an output function:
```
tf.variable_scope("decoder") as varscope
output_fn = lambda x: layers.linear(x, num_decoder_symbols,
scope=varscope)
outputs_train, state_train = seq2seq.dynamic_rnn_decoder(...)
logits_train = output_fn(outputs_train)
varscope.reuse_variables()
logits_inference, state_inference = seq2seq.dynamic_rnn_decoder(
output_fn=output_fn, ...)
```
If `None` is supplied it will act as an identity function, which
might be wanted when using the RNNCell `OutputProjectionWrapper`.
encoder_state: The encoded state to initialize the `dynamic_rnn_decoder`.
embeddings: The embeddings matrix used for the decoder sized
`[num_decoder_symbols, embedding_size]`.
start_of_sequence_id: The start of sequence ID in the decoder embeddings.
end_of_sequence_id: The end of sequence ID in the decoder embeddings.
maximum_length: The maximum allowed number of time steps to decode.
num_decoder_symbols: The number of classes to decode at each time step.
dtype: (default: `dtypes.int32`) The default data type to use when
handling integer objects.
name: (default: `None`) NameScope for the decoder function;
defaults to "simple_decoder_fn_inference"
Returns:
A decoder function with the required interface of `dynamic_rnn_decoder`
intended for inference.
"""
with ops.name_scope(name, "simple_decoder_fn_inference",
[output_fn, encoder_state, embeddings,
start_of_sequence_id, end_of_sequence_id,
maximum_length, num_decoder_symbols, dtype]):
start_of_sequence_id = ops.convert_to_tensor(start_of_sequence_id, dtype)
end_of_sequence_id = ops.convert_to_tensor(end_of_sequence_id, dtype)
maximum_length = ops.convert_to_tensor(maximum_length, dtype)
num_decoder_symbols = ops.convert_to_tensor(num_decoder_symbols, dtype)
encoder_info = nest.flatten(encoder_state)[0]
batch_size = encoder_info.get_shape()[0].value
if output_fn is None:
output_fn = lambda x: x
if batch_size is None:
batch_size = array_ops.shape(encoder_info)[0]
def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
""" Decoder function used in the `dynamic_rnn_decoder` with the purpose of
inference.
The main difference between this decoder function and the `decoder_fn` in
`simple_decoder_fn_train` is how `next_cell_input` is calculated. In this
decoder function we calculate the next input by applying an argmax across
the feature dimension of the output from the decoder. This is a
greedy-search approach. (Bahdanau et al., 2014) & (Sutskever et al., 2014)
use beam-search instead.
Args:
time: positive integer constant reflecting the current timestep.
cell_state: state of RNNCell.
cell_input: input provided by `dynamic_rnn_decoder`.
cell_output: output of RNNCell.
context_state: context state provided by `dynamic_rnn_decoder`.
Returns:
A tuple (done, next state, next input, emit output, next context state)
where:
done: A boolean vector indicating which sentences have reached an
`end_of_sequence_id`. This is used for early stopping by the
`dynamic_rnn_decoder`. When `time >= maximum_length` a boolean vector with
all elements set to `true` is returned.
next state: `cell_state`, this decoder function does not modify the
given state.
next input: The embedding from argmax of the `cell_output` is used as
`next_input`.
emit output: If `output_fn is None` the supplied `cell_output` is
returned, else the `output_fn` is used to update the `cell_output`
before calculating `next_input` and returning `cell_output`.
next context state: `context_state`, this decoder function does not
modify the given context state. The context state could be modified when
applying e.g. beam search.
"""
with ops.name_scope(name, "simple_decoder_fn_inference",
[time, cell_state, cell_input, cell_output,
context_state]):
if cell_input is not None:
raise ValueError("Expected cell_input to be None, but saw: %s" %
cell_input)
if cell_output is None:
# invariant that this is time == 0
next_input_id = array_ops.ones([batch_size,], dtype=dtype) * (
start_of_sequence_id)
done = array_ops.zeros([batch_size,], dtype=dtypes.bool)
cell_state = encoder_state
cell_output = array_ops.zeros([num_decoder_symbols],
dtype=dtypes.float32)
else:
cell_output = output_fn(cell_output)
next_input_id = math_ops.cast(
math_ops.argmax(cell_output, 1), dtype=dtype)
done = math_ops.equal(next_input_id, end_of_sequence_id)
next_input = array_ops.gather(embeddings, next_input_id)
# if time > maxlen, return all true vector
done = control_flow_ops.cond(math_ops.greater(time, maximum_length),
lambda: array_ops.ones([batch_size,], dtype=dtypes.bool),
lambda: done)
return (done, cell_state, next_input, cell_output, context_state)
return decoder_fn
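A self-contained inference sketch under stated assumptions: `output_fn=None` (identity) together with a GRU cell whose output size equals `num_decoder_symbols`, so the raw cell output acts as the logits; all sizes are toy values:

```python
import tensorflow as tf

batch_size, num_symbols, embed_size = 2, 20, 4

encoder_state = tf.zeros([batch_size, num_symbols])  # matches the GRU state size
embeddings = tf.get_variable("embeddings", [num_symbols, embed_size])
cell = tf.nn.rnn_cell.GRUCell(num_symbols)  # output size == num_decoder_symbols

decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_inference(
    output_fn=None,              # identity: the cell output is used as class logits
    encoder_state=encoder_state,
    embeddings=embeddings,
    start_of_sequence_id=1,
    end_of_sequence_id=1,
    maximum_length=6,
    num_decoder_symbols=num_symbols,
    dtype=tf.int32)

# No `inputs` at inference time: the next input is the embedding of the argmax
# over the previous output (greedy decoding); decoding stops early once every
# sequence emits `end_of_sequence_id` or `maximum_length` is exceeded.
outputs, state = tf.contrib.seq2seq.dynamic_rnn_decoder(
    cell=cell, decoder_fn=decoder_fn, time_major=True)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  out, _ = sess.run([outputs, state])  # out: [decoded_time, batch, num_symbols]
```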

View File

@ -0,0 +1,208 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Seq2seq layer operations for use in neural networks.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.contrib import layers
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import rnn
from tensorflow.python.ops import tensor_array_ops
from tensorflow.python.ops import variable_scope as vs
__all__ = ["dynamic_rnn_decoder"]
def dynamic_rnn_decoder(cell, decoder_fn, inputs=None, sequence_length=None,
parallel_iterations=None, swap_memory=False,
time_major=False, scope=None, name=None):
""" Dynamic RNN decoder for a sequence-to-sequence model specified by
RNNCell and decoder function.
The `dynamic_rnn_decoder` is similar to `tf.python.ops.rnn.dynamic_rnn`
in that the decoder makes no assumptions about the sequence length or batch
size of the input.
The `dynamic_rnn_decoder` has two modes, training and inference, and expects
the user to create separate functions for each.
Under both training and inference, `cell` and `decoder_fn` are expected, where
the `cell` performs computation at every timestep using `raw_rnn` and
the `decoder_fn` allows modeling of early stopping, output, state, and next
input and context.
When training the user is expected to supply `inputs`. At every time step a
slice of the supplied input is fed to the `decoder_fn`, which modifies and
returns the input for the next time step.
`sequence_length` is needed at training time, i.e., when `inputs` is not
None, for dynamic unrolling. At test time, when `inputs` is None,
`sequence_length` is not needed.
Under inference `inputs` is expected to be `None` and the input is inferred
solely from the `decoder_fn`.
Args:
cell: An instance of RNNCell.
decoder_fn: A function that takes time, cell state, cell input,
cell output and context state. It returns an early stopping vector,
cell state, next input, cell output and context state.
Examples of decoder_fn can be found in decoder_fn.py.
inputs: The inputs for decoding (embedded format).
If `time_major == False` (default), this must be a `Tensor` of shape:
`[batch_size, max_time, ...]`.
If `time_major == True`, this must be a `Tensor` of shape:
`[max_time, batch_size, ...]`.
The input to `cell` at each time step will be a `Tensor` with dimensions
`[batch_size, ...]`.
sequence_length: (optional) An int32/int64 vector sized `[batch_size]`.
If `inputs` is not None and `sequence_length` is None, it is inferred
from the `inputs` as the maximal possible sequence length.
parallel_iterations: (Default: 32). The number of iterations to run in
parallel. Those operations which do not have any temporal dependency
and can be run in parallel, will be. This parameter trades off
time for space. Values >> 1 use more memory but take less time,
while smaller values use less memory but computations take longer.
swap_memory: Transparently swap the tensors produced in forward inference
but needed for back prop from GPU to CPU. This allows training RNNs
which would typically not fit on a single GPU, with very minimal (or no)
performance penalty.
time_major: The shape format of the `inputs` and `outputs` Tensors.
If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
Using `time_major = True` is a bit more efficient because it avoids
transposes at the beginning and end of the RNN calculation. However,
most TensorFlow data is batch-major, so by default this function
accepts input and emits output in batch-major form.
scope: VariableScope for the `raw_rnn`;
defaults to None.
name: NameScope for the decoder;
defaults to "dynamic_rnn_decoder"
Returns:
A pair (outputs, state) where:
outputs: the RNN output 'Tensor'.
If time_major == False (default), this will be a `Tensor` shaped:
`[batch_size, max_time, cell.output_size]`.
If time_major == True, this will be a `Tensor` shaped:
`[max_time, batch_size, cell.output_size]`.
state: The final state, shaped
`[batch_size, cell.state_size]`.
Raises:
ValueError: if inputs is not None and has less than three dimensions.
"""
with ops.name_scope(name, "dynamic_rnn_decoder",
[cell, decoder_fn, inputs, sequence_length,
parallel_iterations, swap_memory, time_major, scope]):
if inputs is not None:
# Convert to tensor
inputs = ops.convert_to_tensor(inputs)
# Test input dimensions
if inputs.get_shape().ndims is not None and (
inputs.get_shape().ndims < 2):
raise ValueError("Inputs must have at least two dimensions")
# Setup of RNN (dimensions, sizes, length, initial state, dtype)
if not time_major:
# [batch, seq, features] -> [seq, batch, features]
inputs = array_ops.transpose(inputs, perm=[1, 0, 2])
dtype = inputs.dtype
# Get data input information
input_depth = int(inputs.get_shape()[2])
batch_depth = inputs.get_shape()[1].value
max_time = inputs.get_shape()[0].value
if max_time is None:
max_time = array_ops.shape(inputs)[0]
# Setup decoder inputs as TensorArray
inputs_ta = tensor_array_ops.TensorArray(dtype, size=max_time)
inputs_ta = inputs_ta.unpack(inputs)
def loop_fn(time, cell_output, cell_state, loop_state):
if cell_state is None: # first call, before while loop (in raw_rnn)
if cell_output is not None:
raise ValueError("Expected cell_output to be None when cell_state "
"is None, but saw: %s" % cell_output)
if loop_state is not None:
raise ValueError("Expected loop_state to be None when cell_state "
"is None, but saw: %s" % loop_state)
context_state = None
else: # subsequent calls, inside while loop, after cell execution
if isinstance(loop_state, tuple):
(done, context_state) = loop_state
else:
done = loop_state
context_state = None
# call decoder function
if inputs is not None: # training
# get next_cell_input
if cell_state is None:
next_cell_input = inputs_ta.read(0)
else:
if batch_depth is not None:
batch_size = batch_depth
else:
batch_size = array_ops.shape(done)[0]
next_cell_input = control_flow_ops.cond(
math_ops.equal(time, max_time),
lambda: array_ops.zeros([batch_size, input_depth], dtype=dtype),
lambda: inputs_ta.read(time))
(next_done, next_cell_state, next_cell_input, emit_output,
next_context_state) = decoder_fn(time, cell_state, next_cell_input,
cell_output, context_state)
else: # inference
# next_cell_input is obtained through decoder_fn
(next_done, next_cell_state, next_cell_input, emit_output,
next_context_state) = decoder_fn(time, cell_state, None, cell_output,
context_state)
# check if we are done
if next_done is None: # training
next_done = time >= sequence_length
# build next_loop_state
if next_context_state is None:
next_loop_state = next_done
else:
next_loop_state = (next_done, next_context_state)
return (next_done, next_cell_input, next_cell_state,
emit_output, next_loop_state)
# Run raw_rnn function
outputs_ta, state, _ = rnn.raw_rnn(
cell, loop_fn, parallel_iterations=parallel_iterations,
swap_memory=swap_memory, scope=scope)
outputs = outputs_ta.pack()
if not time_major:
# [seq, batch, features] -> [batch, seq, features]
outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
return outputs, state
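A training-mode sketch that mirrors the wiring in `kernel_tests/seq2seq_test.py`, with toy shapes:

```python
import tensorflow as tf

batch_size, max_time, embed_size, hidden_size = 2, 7, 4, 5
decoder_inputs = tf.zeros([max_time, batch_size, embed_size])  # time-major, embedded
sequence_length = tf.constant(max_time, dtype=tf.int32, shape=[batch_size])
encoder_state = tf.zeros([batch_size, hidden_size])

cell = tf.nn.rnn_cell.GRUCell(hidden_size)
decoder_fn = tf.contrib.seq2seq.simple_decoder_fn_train(encoder_state)

# Training mode: `inputs` and `sequence_length` drive the unrolling; at every
# step a slice of `decoder_inputs` is passed through `decoder_fn` to the cell.
outputs, state = tf.contrib.seq2seq.dynamic_rnn_decoder(
    cell=cell, decoder_fn=decoder_fn, inputs=decoder_inputs,
    sequence_length=sequence_length, time_major=True)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  out, _ = sess.run([outputs, state])  # out.shape == (max_time, batch_size, hidden_size)
```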

View File

@ -48,7 +48,7 @@ auto* load_attempt_count = monitoring::Counter<2>::New(
"model_path", "status");
auto* load_latency = monitoring::Counter<1>::New(
"/tensorflow/contrib/session_bundle/load_latency",
"Latency in microseconds for SessionBundles that were succesfully loaded.",
"Latency in microseconds for SessionBundles that were successfully loaded.",
"model_path");
constexpr char kLoadAttemptFail[] = "fail";
constexpr char kLoadAttemptSuccess[] = "success";

View File

@ -109,7 +109,7 @@ weights = variables.variable('weights',
Note that in native TensorFlow, there are two types of variables: regular
variables and local (transient) variables. The vast majority of variables are
regular variables: once created, they can be saved to disk using a
[saver](https://www.tensorflow.org/versions/r0.9/api_docs/python/state_ops.html#Saver).
[saver](https://www.tensorflow.org/versions/r0.11/api_docs/python/state_ops.html#Saver).
Local variables are those variables that only exist for the duration of a
session and are not saved to disk.
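A short illustration of the distinction using plain TensorFlow calls (the collection names are the standard `GraphKeys`):

```python
import tensorflow as tf

# A regular variable: added to GLOBAL_VARIABLES and therefore saved by tf.train.Saver.
weights = tf.Variable(tf.zeros([10, 10]), name="weights")

# A local (transient) variable: lives only for the duration of a session and
# is not written to checkpoints.
counter = tf.Variable(0, name="counter", trainable=False,
                      collections=[tf.GraphKeys.LOCAL_VARIABLES])

print([v.op.name for v in tf.global_variables()])  # ['weights']
print([v.op.name for v in tf.local_variables()])   # ['counter']
```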
@ -215,7 +215,7 @@ Dropout| [slim.dropout](https://www.tensorflow.org/code/tensorflow/contrib/layer
Flatten | [slim.flatten](https://www.tensorflow.org/code/tensorflow/contrib/layers/python/layers/layers.py)
MaxPool2D | [slim.max_pool2d](https://www.tensorflow.org/code/tensorflow/contrib/layers/python/layers/layers.py)
OneHotEncoding | [slim.one_hot_encoding](https://www.tensorflow.org/code/tensorflow/contrib/layers/python/layers/layers.py)
SeperableConv2 | [slim.seperable_conv2d](https://www.tensorflow.org/code/tensorflow/contrib/layers/python/layers/layers.py)
SeparableConv2 | [slim.separable_conv2d](https://www.tensorflow.org/code/tensorflow/contrib/layers/python/layers/layers.py)
UnitNorm | [slim.unit_norm](https://www.tensorflow.org/code/tensorflow/contrib/layers/python/layers/layers.py)
TF-Slim also provides two meta-operations called `repeat` and `stack` that
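A hedged sketch of the two meta-operations (layer arguments here are illustrative): `slim.repeat` re-applies a layer several times with the same arguments, while `slim.stack` applies it with different arguments at each step.

```python
import tensorflow as tf
slim = tf.contrib.slim

images = tf.placeholder(tf.float32, [None, 32, 32, 3])
# repeat: apply slim.conv2d three times with identical arguments.
net = slim.repeat(images, 3, slim.conv2d, 64, [3, 3], scope="conv1")

x = tf.placeholder(tf.float32, [None, 16])
# stack: apply slim.fully_connected with a different width at each step.
y = slim.stack(x, slim.fully_connected, [32, 64, 128], scope="fc")
```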
@ -901,7 +901,7 @@ slim.evaluation.evaluation_loop(
log_dir,
num_evals=num_batches,
eval_op=names_to_updates.values(),
summary_op=tf.merge_summary(summary_ops),
summary_op=tf.summary.merge(summary_ops),
eval_interval_secs=eval_interval_secs)
```

View File

@ -283,5 +283,3 @@ def evaluation_loop(master,
config=session_config,
max_number_of_evaluations=max_number_of_evaluations,
timeout=timeout)

View File

@ -625,7 +625,7 @@ class TrainTest(tf.test.TestCase):
tf.set_random_seed(2)
train_op = self.create_train_op()
model_variables = tf.all_variables()
model_variables = tf.global_variables()
model_path = os.path.join(logdir1, 'model.ckpt-300')
init_op = tf.global_variables_initializer()
@ -674,7 +674,7 @@ class TrainTest(tf.test.TestCase):
tf.set_random_seed(2)
train_op = self.create_train_op()
model_variables = tf.all_variables()
model_variables = tf.global_variables()
model_path = os.path.join(logdir1, 'model.ckpt-300')
saver = tf.train.Saver(model_variables)
def RestoreFn(sess):

View File

@ -84,7 +84,7 @@ def analyze_vars(variables, print_info=False):
"""Prints the names and shapes of the variables.
Args:
variables: list of variables, for example tf.all_variables().
variables: list of variables, for example tf.global_variables().
print_info: Optional, if true print variables and their shape.
Returns:

View File

@ -197,7 +197,7 @@ class SpecsTest(tf.test.TestCase):
initializer=tf.constant_initializer(42.0))
inputs = tf.constant(_rand(10, 100))
outputs = v.funcall(inputs)
self.assertEqual(len(tf.all_variables()), 1)
self.assertEqual(len(tf.global_variables()), 1)
sess.run([outputs.initializer])
outputs_value = outputs.eval()
self.assertEqual(outputs_value.shape, (2, 2))
@ -211,7 +211,7 @@ class SpecsTest(tf.test.TestCase):
g = f | f | f | f
inputs = tf.constant(_rand(10, 100))
_ = g.funcall(inputs)
self.assertEqual(len(tf.all_variables()), 2)
self.assertEqual(len(tf.global_variables()), 2)
def testAutoFunction(self):
with self.test_session():

View File

@ -34,7 +34,7 @@ class StatSummarizerTest(tf.test.TestCase):
graph_def.SerializeToString())
with self.test_session() as sess:
sess.run(tf.initialize_all_variables())
sess.run(tf.global_variables_initializer())
for _ in range(20):
run_metadata = tf.RunMetadata()

View File

@ -31,7 +31,7 @@ class ProjectorApiTest(tf.test.TestCase):
# Create a dummy configuration.
config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
config.model_checkpoint_path = 'test'
emb1 = config.embedding.add()
emb1 = config.embeddings.add()
emb1.tensor_name = 'tensor1'
emb1.metadata_path = 'metadata1'
@ -47,3 +47,7 @@ class ProjectorApiTest(tf.test.TestCase):
config2 = tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
text_format.Parse(f.read(), config2)
self.assertEqual(config, config2)
if __name__ == "__main__":
tf.test.main()

View File

@ -42,7 +42,7 @@ class GreedyLoadBalancingStrategy(object):
off CPU-intensive ops with RAM-intensive ops with network bandwidth.
This class is intended to be used as a `ps_strategy` in
`tf.replica_device_setter`.
`tf.train.replica_device_setter`.
"""
def __init__(self, num_tasks, load_fn):
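A hedged usage sketch, assuming the class and `byte_size_load_fn` are exported via `tf.contrib.training` and that `tf.train.replica_device_setter` accepts the `ps_strategy` argument referenced in the docstring:

```python
import tensorflow as tf

# Hypothetical cluster with three parameter-server tasks; each new ps op is
# placed on the least-loaded task, with load measured by variable byte size.
greedy = tf.contrib.training.GreedyLoadBalancingStrategy(
    num_tasks=3, load_fn=tf.contrib.training.byte_size_load_fn)

with tf.device(tf.train.replica_device_setter(ps_tasks=3, ps_strategy=greedy)):
  embedding = tf.Variable(tf.zeros([10000, 64]), name="embedding")
  bias = tf.Variable(tf.zeros([64]), name="bias")
```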

View File

@ -51,7 +51,7 @@ class CheckpointIteratorTest(tf.test.TestCase):
saver = tf.train.Saver() # Saves the global step.
with self.test_session() as session:
session.run(tf.initialize_all_variables())
session.run(tf.global_variables_initializer())
save_path = os.path.join(checkpoint_dir, 'model.ckpt')
saver.save(session, save_path, global_step=global_step)
@ -81,7 +81,7 @@ class CheckpointIteratorTest(tf.test.TestCase):
target='',
config=tf.ConfigProto(device_count={'CPU': 2})) as session:
session.run(tf.initialize_all_variables())
session.run(tf.global_variables_initializer())
save_path = os.path.join(checkpoint_dir, 'model.ckpt')
saver.save(session, save_path, global_step=global_step)

View File

@ -310,7 +310,7 @@ class TrainTest(tf.test.TestCase):
tf.set_random_seed(2)
train_op = self.create_train_op()
model_variables = tf.all_variables()
model_variables = tf.global_variables()
model_path = os.path.join(logdir1, 'model.ckpt-300')
assign_fn = tf.contrib.framework.assign_from_checkpoint_fn(

View File

@ -949,12 +949,12 @@ cc_library(
# Libraries with GPU facilities that are useful for writing kernels.
cc_library(
name = "gpu_lib",
srcs = if_not_windows([
srcs = [
"common_runtime/gpu/gpu_event_mgr.cc",
]),
hdrs = if_not_windows([
],
hdrs = [
"common_runtime/gpu/gpu_event_mgr.h",
]),
],
copts = tf_copts(),
visibility = ["//visibility:public"],
deps = [
@ -964,7 +964,8 @@ cc_library(
":lib_internal",
":proto_text",
":protos_all_cc",
] + if_not_windows([":stream_executor"]),
":stream_executor",
],
)
cc_library(
@ -982,7 +983,7 @@ tf_proto_library_cc(
name = "worker_proto",
srcs = ["protobuf/worker.proto"],
cc_api_version = 2,
cc_libs = [":protos_all_cc"],
protodeps = [":protos_all"],
visibility = [
"//tensorflow:internal",
],
@ -993,8 +994,8 @@ tf_proto_library_cc(
srcs = ["protobuf/worker_service.proto"],
has_services = 1,
cc_api_version = 2,
cc_libs = [":worker_proto_cc"],
cc_stubby_versions = ["2"],
protodeps = [":worker_proto"],
visibility = [
"//tensorflow:internal",
],
@ -1004,7 +1005,7 @@ tf_proto_library_cc(
name = "master_proto",
srcs = ["protobuf/master.proto"],
cc_api_version = 2,
cc_libs = [":protos_all_cc"],
protodeps = [":protos_all"],
visibility = [
"//tensorflow:internal",
],
@ -1015,8 +1016,8 @@ tf_proto_library_cc(
srcs = ["protobuf/master_service.proto"],
has_services = 1,
cc_api_version = 2,
cc_libs = [":master_proto_cc"],
cc_stubby_versions = ["2"],
protodeps = [":master_proto"],
visibility = [
"//tensorflow:internal",
],
@ -1417,7 +1418,7 @@ tf_cuda_library(
tf_cuda_library(
name = "gpu_runtime",
srcs = if_not_windows([
srcs = [
"common_runtime/gpu/gpu_bfc_allocator.cc",
"common_runtime/gpu/gpu_debug_allocator.cc",
"common_runtime/gpu/gpu_device.cc",
@ -1429,8 +1430,8 @@ tf_cuda_library(
"common_runtime/gpu/pool_allocator.cc",
"common_runtime/gpu/process_state.cc",
"common_runtime/gpu_device_context.h",
]),
hdrs = if_not_windows([
],
hdrs = [
"common_runtime/gpu/gpu_bfc_allocator.h",
"common_runtime/gpu/gpu_debug_allocator.h",
"common_runtime/gpu/gpu_device.h",
@ -1439,7 +1440,7 @@ tf_cuda_library(
"common_runtime/gpu/gpu_util.h",
"common_runtime/gpu/pool_allocator.h",
"common_runtime/gpu/process_state.h",
]),
],
copts = tf_copts(),
linkstatic = 1,
deps = [
@ -1451,10 +1452,9 @@ tf_cuda_library(
":lib",
":lib_internal",
":protos_all_cc",
"//third_party/eigen3",
] + if_not_windows([
":stream_executor",
]),
"//third_party/eigen3",
],
alwayslink = 1,
)

View File

@ -835,7 +835,7 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib) {
FunctionLibraryDefinition flib(OpRegistry::Global(), library_graph_def);
Graph g(&flib);
Tensor t(DT_FLOAT, TensorShape({}));
t.scalar<float>()() = {1.2};
t.scalar<float>()() = {1.2f};
Node* x = test::graph::Constant(&g, t);
Node* y;
if (use_function_lib) {
@ -945,7 +945,7 @@ TEST(DirectSessionTest, TestSessionInterOpThreadsWithFunctions) {
TEST(DirectSessionTest, TestSessionInterOpThreadsInvalidOptions) {
Graph g(OpRegistry::Global());
Tensor t(DT_FLOAT, TensorShape({}));
t.scalar<float>()() = {1.2};
t.scalar<float>()() = {1.2f};
Node* x = test::graph::Constant(&g, t);
GraphDef def;
test::graph::ToGraphDef(&g, &def);
@ -979,7 +979,7 @@ TEST(DirectSessionTest, TestDirectSessionRunClose) {
// Construct a graph with a variable and a single assign.
Graph g(OpRegistry::Global());
Tensor t(DT_FLOAT, TensorShape({}));
t.scalar<float>()() = {1.2};
t.scalar<float>()() = {1.2f};
Node* var_val = test::graph::Constant(&g, t);
Node* var = test::graph::Var(&g, DT_FLOAT, {});
Node* var_assign = test::graph::Assign(&g, var, var_val);
@ -1063,7 +1063,7 @@ TEST(DirectSessionTest, TestDirectSessionReset) {
// Construct a graph with a variable and a single assign.
Graph g(OpRegistry::Global());
Tensor t(DT_FLOAT, TensorShape({}));
t.scalar<float>()() = {1.2};
t.scalar<float>()() = {1.2f};
Node* var_val = test::graph::Constant(&g, t);
Node* var = test::graph::Var(&g, DT_FLOAT, {});
Node* var_assign = test::graph::Assign(&g, var, var_val);

View File

@ -39,7 +39,7 @@ class StepStatsCollector;
// Rendezvous* rendezvous = NewNaiveRendezvous();
// TF_CHECK_OK(rendezvous->Send("input", some_input_tensor));
// TF_CHECK_OK(executor->Run({ExecutorOpts, rendezvous, nullptr}));
// TF_CHECK_OK(rendezvous->Recv("input", &output_tensor));
// TF_CHECK_OK(rendezvous->Recv("output", &output_tensor));
// ... ...
//
// Multiple threads can call Executor::Run concurrently.

View File

@ -19,16 +19,26 @@ limitations under the License.
namespace tensorflow {
SYCLAllocator::~SYCLAllocator() { }
SYCLAllocator::~SYCLAllocator() {}
string SYCLAllocator::Name() { return "device:SYCL"; }
void *SYCLAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
assert(device_);
auto p = device_->allocate(num_bytes);
return p;
}
void SYCLAllocator::DeallocateRaw(void *ptr) { device_->deallocate(ptr); }
void SYCLAllocator::DeallocateRaw(void *ptr) {
if (device_) {
device_->deallocate(ptr);
}
}
void SYCLAllocator::EnterLameDuckMode() {
device_->deallocate_all();
device_ = nullptr;
}
} // namespace tensorflow

View File

@ -29,14 +29,16 @@ namespace tensorflow {
class SYCLAllocator : public Allocator {
public:
SYCLAllocator(Eigen::SyclDevice* device) : device_(device) {}
SYCLAllocator(Eigen::QueueInterface* device) : device_(device) {}
virtual ~SYCLAllocator() override;
string Name() override;
void *AllocateRaw(size_t alignment, size_t num_bytes) override;
void DeallocateRaw(void *ptr) override;
void EnterLameDuckMode();
virtual bool ShouldAllocateEmptyTensors() override final { return true; }
private:
Eigen::SyclDevice *device_; // not owned
Eigen::QueueInterface *device_; // not owned
TF_DISALLOW_COPY_AND_ASSIGN(SYCLAllocator);
};

View File

@ -25,8 +25,9 @@ namespace tensorflow {
SYCLDevice::~SYCLDevice() {
device_context_->Unref();
delete sycl_allocator_;
sycl_allocator_->EnterLameDuckMode();
delete sycl_device_;
delete sycl_queue_;
}
void SYCLDevice::Compute(OpKernel *op_kernel, OpKernelContext *context) {
@ -50,12 +51,8 @@ Allocator *SYCLDevice::GetAllocator(AllocatorAttributes attr) {
Status SYCLDevice::MakeTensorFromProto(const TensorProto &tensor_proto,
const AllocatorAttributes alloc_attrs,
Tensor *tensor) {
AllocatorAttributes attr;
attr.set_on_host(true);
attr.set_gpu_compatible(true);
Allocator *host_alloc = GetAllocator(attr);
Tensor parsed(tensor_proto.dtype());
if (!parsed.FromProto(host_alloc, tensor_proto)) {
if (!parsed.FromProto(cpu_allocator_, tensor_proto)) {
return errors::InvalidArgument("Cannot parse tensor from proto: ",
tensor_proto.DebugString());
}
@ -86,6 +83,12 @@ Status SYCLDevice::FillContextMap(const Graph *graph,
return Status::OK();
}
Status SYCLDevice::Sync() {
sycl_device_->synchronize();
return Status::OK();
}
} // namespace tensorflow
#endif // TENSORFLOW_USE_SYCL

View File

@ -22,7 +22,6 @@ limitations under the License.
#define EIGEN_USE_SYCL
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/common_runtime/sycl/sycl_allocator.h"
#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
@ -30,7 +29,6 @@ limitations under the License.
namespace tensorflow {
class SYCLDevice : public LocalDevice {
public:
template <typename SYCLSelector>
@ -42,8 +40,9 @@ public:
name, DEVICE_SYCL, memory_limit, locality,
physical_device_desc), nullptr),
cpu_allocator_(cpu_allocator),
sycl_device_(new Eigen::SyclDevice(sycl_selector)),
sycl_allocator_(new SYCLAllocator(sycl_device_)),
sycl_queue_(new Eigen::QueueInterface(sycl_selector)),
sycl_device_(new Eigen::SyclDevice(sycl_queue_)),
sycl_allocator_(new SYCLAllocator(sycl_queue_)),
device_context_(new SYCLDeviceContext()) {
set_eigen_sycl_device(sycl_device_);
}
@ -59,16 +58,17 @@ public:
Status FillContextMap(const Graph *graph,
DeviceContextMap *device_context_map) override;
Status Sync() override { return Status::OK(); }
Status Sync() override;
static string GetShortDeviceDescription(/*int device_id,
const DeviceDescription& desc*/) {
return strings::StrCat("device: 0, name SYCL, pci bus id: 0");
}
private:
Allocator *cpu_allocator_; // owned
Eigen::SyclDevice* sycl_device_; // owned
SYCLAllocator *sycl_allocator_; // owned
Allocator *cpu_allocator_; // owned
Eigen::QueueInterface* sycl_queue_; // owned
Eigen::SyclDevice* sycl_device_; // owned
SYCLAllocator *sycl_allocator_; // owned
SYCLDeviceContext *device_context_;
};

View File

@ -16,13 +16,11 @@ limitations under the License.
#if TENSORFLOW_USE_SYCL
#define EIGEN_USE_SYCL
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/sycl/sycl_device_context.h"
#include "tensorflow/core/common_runtime/dma_helper.h"
#define EIGEN_USE_SYCL
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
namespace tensorflow {
void SYCLDeviceContext::CopyCPUTensorToDevice(const Tensor *cpu_tensor,
@ -108,7 +106,6 @@ void SYCLDeviceContext::CopyDeviceTensorToCPU(const Tensor *device_tensor,
StatusCallback done) {
const int64 total_bytes = device_tensor->TotalBytes();
if (total_bytes > 0) {
device->eigen_sycl_device()->deallocate_all();
const void* src_ptr = DMAHelper::base(device_tensor);
void* dst_ptr = DMAHelper::base(cpu_tensor);
switch (device_tensor->dtype()) {

View File

@ -15,6 +15,7 @@ limitations under the License.
#if TENSORFLOW_USE_SYCL
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/sycl/sycl_device.h"
namespace tensorflow {

View File

@ -36,7 +36,7 @@ tf_proto_library_cc(
has_services = 1,
cc_api_version = 2,
cc_grpc_version = 1,
cc_libs = ["//tensorflow/core:protos_all_cc"],
protodeps = ["//tensorflow/core:protos_all"],
)
# Depending on this target causes a concrete DebuggerState implementation

View File

@ -372,9 +372,9 @@ TEST_F(SessionDebugMinusAXTest,
debug_gateway.SetNodeValueCallback(
[this, &mu, &val_callback_count, &a_debug_identity_node_name,
&x_debug_identity_node_name, &y_debug_identity_node_name,
&debug_identity_tensor_vals,
&callbacks_done](const string& node_name, const int output_slot,
const Tensor& tensor_value, const bool is_ref) {
&debug_identity_tensor_vals, &callbacks_done, &kConcurrentRuns](
const string& node_name, const int output_slot,
const Tensor& tensor_value, const bool is_ref) {
mutex_lock l(mu);
if (node_name == a_debug_identity_node_name && output_slot == 0) {

View File

@ -18,6 +18,12 @@ limitations under the License.
#include <vector>
#include "grpc++/create_channel.h"
#if defined(PLATFORM_WINDOWS)
// winsock2.h is used in grpc, so Ws2_32.lib is needed
#pragma comment(lib,"Ws2_32.lib")
#endif
#include "tensorflow/core/debug/debug_service.grpc.pb.h"
#include "tensorflow/core/framework/summary.pb.h"
#include "tensorflow/core/lib/io/path.h"

View File

@ -273,7 +273,8 @@ TEST_F(DebugIOUtilsTest, PublishTensorConcurrentlyToPartiallyOverlappingPaths) {
auto fn = [this, &dump_count, &done_count, &mu, &dump_root_base, &dump_roots,
&dump_file_paths, &wall_time, &tensor_name, &debug_node_name,
&kNodeName, &kDebugOpName, &kConcurrentPubs, &all_done]() {
&kNodeName, &kDebugOpName, &kConcurrentPubs, &kOutputSlot,
&all_done]() {
// "gumpy" is the shared directory part of the path.
string dump_root;
string debug_url;

View File

@ -220,7 +220,7 @@ TEST(PartialTensorShapeTest, PartialShapeMergeWith) {
TEST(PartialTensorShapeTest, MakePartialShapeEmpty) {
// Empty made partial shapes should still be fully defined
const int64 dims[0] = {};
const int64 dims[1] = {};
PartialTensorShape shape;
EXPECT_FALSE(shape.IsFullyDefined());
TF_ASSERT_OK(PartialTensorShape::MakePartialShape(dims, 0, &shape));

View File

@ -16,6 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_FRAMEWORK_TENSOR_TESTUTIL_H_
#define TENSORFLOW_FRAMEWORK_TENSOR_TESTUTIL_H_
#include <numeric>
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/logging.h"

View File

@ -2342,7 +2342,6 @@ cc_library(
":batch_norm_op",
":bias_op",
":conv_ops",
":depthwise_conv_grad_op",
":dilation_ops",
":fused_batch_norm_op",
":in_topk_op",
@ -2354,7 +2353,10 @@ cc_library(
":softsign_op",
":topk_op",
":xent_op",
] + if_not_windows([":depthwise_conv_op"]),
] + if_not_windows([
":depthwise_conv_grad_op",
":depthwise_conv_op",
]),
)
NN_DEPS = [

View File

@ -56,7 +56,7 @@ TEST_F(AdjustContrastOpTest, Simple_1223) {
TF_EXPECT_OK(InitOp());
AddInputFromArray<float>(TensorShape({1, 2, 2, 3}),
{1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8, 12});
AddInputFromArray<float>(TensorShape({}), {0.2});
AddInputFromArray<float>(TensorShape({}), {0.2f});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 2, 3}));
@ -78,7 +78,7 @@ TEST_F(AdjustContrastOpTest, Big_99x99x3) {
}
AddInputFromArray<float>(TensorShape({1, 99, 99, 3}), values);
AddInputFromArray<float>(TensorShape({}), {0.2});
AddInputFromArray<float>(TensorShape({}), {0.2f});
TF_ASSERT_OK(RunOpKernel());
}

View File

@ -47,15 +47,15 @@ TEST_F(BatchNormOpTest, Simple) {
AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
{1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6});
AddInputFromArray<float>(TensorShape({2}), {10, 20});
AddInputFromArray<float>(TensorShape({2}), {0.25, 0.5});
AddInputFromArray<float>(TensorShape({2}), {0.1, 0.6});
AddInputFromArray<float>(TensorShape({2}), {0.0, 0.0});
AddInputFromArray<float>(TensorShape({2}), {0.25f, 0.5f});
AddInputFromArray<float>(TensorShape({2}), {0.1f, 0.6f});
AddInputFromArray<float>(TensorShape({2}), {0.0f, 0.0f});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
test::FillValues<float>(
&expected, {-17.86, -22.00, -15.87, -20.59, -13.87, -19.18, -21.86,
-33.31, -23.85, -34.72, -25.85, -36.13});
&expected, {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, -21.86f,
-33.31f, -23.85f, -34.72f, -25.85f, -36.13f });
test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
}


@ -49,17 +49,18 @@ class CastOpTest : public OpsTestBase {
TF_EXPECT_OK(InitOp());
}
template <typename IN, typename OUT>
template <typename INPUT, typename OUTPUT>
void CheckCast() {
DataType in_type = DataTypeToEnum<IN>::v();
DataType out_type = DataTypeToEnum<OUT>::v();
DataType in_type = DataTypeToEnum<INPUT>::v();
DataType out_type = DataTypeToEnum<OUTPUT>::v();
MakeOp(in_type, out_type);
AddInputFromArray<IN>(TensorShape({1, 2, 2, 1}),
{IN(1), IN(2), IN(3), IN(4)});
AddInputFromArray<INPUT>(TensorShape({1, 2, 2, 1}),
{INPUT(1), INPUT(2), INPUT(3), INPUT(4)});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), out_type, TensorShape({1, 2, 2, 1}));
test::FillValues<OUT>(&expected, {OUT(1), OUT(2), OUT(3), OUT(4)});
test::ExpectTensorEqual<OUT>(expected, *GetOutput(0));
test::FillValues<OUTPUT>(&expected,
{OUTPUT(1), OUTPUT(2), OUTPUT(3), OUTPUT(4)});
test::ExpectTensorEqual<OUTPUT>(expected, *GetOutput(0));
}
};

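The rename from IN/OUT to INPUT/OUTPUT in the cast test above is presumably a Windows-header workaround: the Windows SDK defines IN and OUT as empty annotation macros, which can mangle template parameters with those names. The standalone sketch below uses hypothetical SRC/DST parameters to show that any non-macro identifiers work:

```cpp
// Standalone sketch with hypothetical SRC/DST parameter names.
#include <iostream>

template <typename SRC, typename DST>
DST CastValue(SRC value) {
  // Equivalent of the test's cast path: convert one element at a time.
  return static_cast<DST>(value);
}

int main() {
  std::cout << CastValue<double, int>(3.9) << "\n";  // prints 3
  return 0;
}
```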

@ -71,7 +71,7 @@ class RGBToHSVOpTest : public OpsTestBase {
void CheckRedMax(DataType data_type) {
// Test case where red channel dominates
AddInputFromArray<T>(TensorShape({3}), {.8, .4, .2});
AddInputFromArray<T>(TensorShape({3}), {.8f, .4f, .2f});
TF_ASSERT_OK(RunOpKernel());
T expected_h = 1. / 6. * .2 / .6;
@ -85,7 +85,7 @@ class RGBToHSVOpTest : public OpsTestBase {
void CheckGreenMax(DataType data_type) {
// Test case where green channel dominates
AddInputFromArray<T>(TensorShape({3}), {.2, .8, .4});
AddInputFromArray<T>(TensorShape({3}), {.2f, .8f, .4f});
TF_ASSERT_OK(RunOpKernel());
T expected_h = 1. / 6. * (2.0 + (.2 / .6));
@ -99,7 +99,7 @@ class RGBToHSVOpTest : public OpsTestBase {
void CheckBlueMax(DataType data_type) {
// Test case where blue channel dominates
AddInputFromArray<T>(TensorShape({3}), {.4, .2, .8});
AddInputFromArray<T>(TensorShape({3}), {.4f, .2f, .8f});
TF_ASSERT_OK(RunOpKernel());
T expected_h = 1. / 6. * (4.0 + (.2 / .6));
@ -112,7 +112,7 @@ class RGBToHSVOpTest : public OpsTestBase {
}
void CheckNegativeDifference(DataType data_type) {
AddInputFromArray<T>(TensorShape({3}), {0, .1, .2});
AddInputFromArray<T>(TensorShape({3}), {0, .1f, .2f});
TF_ASSERT_OK(RunOpKernel());
T expected_h = 1. / 6. * (4.0 + (-.1 / .2));
@ -220,7 +220,7 @@ class HSVToRGBOpTest : public OpsTestBase {
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), data_type, TensorShape({3}));
test::FillValues<T>(&expected, {0, .1, .2});
test::FillValues<T>(&expected, {0, .1f, .2f});
test::ExpectTensorNear<T>(expected, *GetOutput(0), 1e-6);
}
};


@ -113,9 +113,12 @@ REGISTER_GPU_HOST_REF_KERNEL(string);
#undef REGISTER_GPU_HOST_REF_KERNEL
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \
Name("Switch").Device(DEVICE_SYCL).TypeConstraint<type>("T"), SwitchOp)
#define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("Switch") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.HostMemory("pred"), \
SwitchOp)
REGISTER_SYCL_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
#undef REGISTER_SYCL_KERNEL
@ -219,9 +222,12 @@ REGISTER_GPU_REF_KERNEL(bool);
#undef REGISTER_GPU_REF_KERNEL
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \
Name("Merge").Device(DEVICE_SYCL).TypeConstraint<type>("T"), MergeOp)
#define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER(Name("Merge") \
.Device(DEVICE_SYCL) \
.TypeConstraint<type>("T") \
.HostMemory("value_index"), \
MergeOp)
REGISTER_SYCL_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
#undef REGISTER_SYCL_KERNEL
@ -418,8 +424,12 @@ REGISTER_GPU_HOST_KERNEL(string);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(type) \
REGISTER_KERNEL_BUILDER( \
Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"), NextIterationOp)
REGISTER_KERNEL_BUILDER(Name("NextIteration") \
.Device(DEVICE_SYCL) \
.HostMemory("data") \
.HostMemory("output") \
.TypeConstraint<type>("T"), \
NextIterationOp)
REGISTER_SYCL_KERNEL(bool);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
#undef REGISTER_SYCL_KERNEL


@ -85,13 +85,27 @@ class AbortOpTest : public OpsTestBase {
protected:
};
#ifdef PLATFORM_WINDOWS
#define SIGABRT 3
class KilledBySignal {
public:
explicit KilledBySignal(int signum) : signum_(signum) {}
bool operator()(int exit_status) const { return exit_status == signum_; }
private:
const int signum_;
};
#else
#define KilledBySignal ::testing::KilledBySignal
#endif
// Pass an error message to the op.
TEST_F(AbortOpTest, pass_error_msg) {
TF_ASSERT_OK(NodeDefBuilder("abort_op", "Abort")
.Attr("error_msg", "abort_op_test")
.Finalize(node_def()));
TF_ASSERT_OK(InitOp());
EXPECT_EXIT(RunOpKernel(), ::testing::KilledBySignal(SIGABRT),
EXPECT_EXIT(RunOpKernel(), KilledBySignal(SIGABRT),
"Abort_op intentional failure; abort_op_test");
}
@ -99,7 +113,7 @@ TEST_F(AbortOpTest, pass_error_msg) {
TEST_F(AbortOpTest, default_msg) {
TF_ASSERT_OK(NodeDefBuilder("abort_op", "Abort").Finalize(node_def()));
TF_ASSERT_OK(InitOp());
EXPECT_EXIT(RunOpKernel(), ::testing::KilledBySignal(SIGABRT),
EXPECT_EXIT(RunOpKernel(), KilledBySignal(SIGABRT),
"Abort_op intentional failure; ");
}

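The abort-op test above cannot use ::testing::KilledBySignal on Windows, where gtest does not provide it, so the diff supplies a small predicate class with the same call shape (and a fallback SIGABRT value of 3). The standalone sketch below exercises that predicate shape without gtest; the kSigAbrt constant is just a local stand-in:

```cpp
// Standalone sketch of the predicate shim, without gtest; kSigAbrt is a
// local stand-in for the SIGABRT value the diff defines on Windows.
#include <iostream>

class KilledBySignal {
 public:
  explicit KilledBySignal(int signum) : signum_(signum) {}
  // EXPECT_EXIT passes the child's exit status to this call operator.
  bool operator()(int exit_status) const { return exit_status == signum_; }

 private:
  const int signum_;
};

int main() {
  const int kSigAbrt = 3;
  KilledBySignal predicate(kSigAbrt);
  std::cout << std::boolalpha << predicate(3) << "\n";  // true
  std::cout << std::boolalpha << predicate(0) << "\n";  // false
  return 0;
}
```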

@ -21,6 +21,18 @@ REGISTER5(UnaryOp, CPU, "Abs", functor::abs, float, Eigen::half, double, int32,
#if !defined(IS_MOBILE_PLATFORM)
REGISTER2(UnaryOp, CPU, "ComplexAbs", functor::abs, complex64, complex128);
#endif
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Abs") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::abs<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER4(UnaryOp, GPU, "Abs", functor::abs, float, Eigen::half, double, int64);
REGISTER2(UnaryOp, GPU, "ComplexAbs", functor::abs, complex64, complex128);

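The remaining cwise-op hunks in this commit follow one pattern: each file gains (or narrows) a REGISTER_SYCL_KERNEL macro that registers the op for the SYCL device, currently for float only (note the hunks that replace TF_CALL_NUMBER_TYPES(...) with REGISTER_SYCL_KERNEL(float)). The sketch below imitates the define/instantiate/#undef idiom with a toy registry function; it is a loose stand-in, not TensorFlow's real REGISTER_KERNEL_BUILDER machinery:

```cpp
// Loose, standalone stand-in for the registration idiom; RegisterKernel and
// the "Abs" name are inventions for illustration, not TensorFlow's API.
#include <iostream>
#include <string>

template <typename T>
bool RegisterKernel(const std::string& op_name) {
  std::cout << "registered " << op_name << " for one element type\n";
  return true;
}

// Per-file macro: register the op for a single TYPE via a static initializer,
// then drop the macro again with #undef, as the diff does.
#define REGISTER_SYCL_KERNEL(TYPE) \
  static const bool kRegistered_##TYPE = RegisterKernel<TYPE>("Abs");

REGISTER_SYCL_KERNEL(float)  // only float is wired up for the SYCL device
#undef REGISTER_SYCL_KERNEL

int main() { return kRegistered_float ? 0 : 1; }
```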

@ -17,6 +17,18 @@ limitations under the License.
namespace tensorflow {
REGISTER2(UnaryOp, CPU, "Acos", functor::acos, float, double);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Acos") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::acos<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Acos", functor::acos, float, double);
#endif


@ -26,7 +26,7 @@ REGISTER5(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32,
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::add<TYPE>>);
TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL


@ -17,6 +17,18 @@ limitations under the License.
namespace tensorflow {
REGISTER2(UnaryOp, CPU, "Asin", functor::asin, float, double);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Asin") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::asin<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Asin", functor::asin, float, double);
#endif


@ -17,6 +17,18 @@ limitations under the License.
namespace tensorflow {
REGISTER2(UnaryOp, CPU, "Atan", functor::atan, float, double);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Atan") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::atan<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Atan", functor::atan, float, double);
#endif


@ -17,6 +17,18 @@ limitations under the License.
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Ceil") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::ceil<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double);
#endif


@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, double,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Cos") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::cos<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double);
#endif


@ -30,6 +30,11 @@ REGISTER5(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
Name("Div") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::div<TYPE>>); \
REGISTER_KERNEL_BUILDER( \
Name("RealDiv") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::div<TYPE>>);
REGISTER_SYCL_KERNEL(float)
#undef REGISTER_SYCL_KERNEL

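The Div hunk above registers the same functor under both the "Div" and "RealDiv" op names for the SYCL device, which makes sense because the two ops coincide for floating-point inputs. A toy registry sketch (assumed names, not TensorFlow's registration API) makes the idea concrete:

```cpp
// Toy registry sketch with assumed names; TensorFlow's real registration
// goes through REGISTER_KERNEL_BUILDER, not a std::map.
#include <functional>
#include <iostream>
#include <map>
#include <string>

int main() {
  auto div = [](float x, float y) { return x / y; };
  std::map<std::string, std::function<float(float, float)>> ops;
  ops["Div"] = div;      // both op names resolve to the same float kernel,
  ops["RealDiv"] = div;  // mirroring the shared functor::div in the hunk
  std::cout << ops["RealDiv"](1.0f, 4.0f) << "\n";  // 0.25
  return 0;
}
```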

@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Exp", functor::exp, float, Eigen::half, double,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Exp") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::exp<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double);
#endif


@ -17,6 +17,18 @@ limitations under the License.
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Floor") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::floor<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double);
#endif


@ -18,6 +18,9 @@ limitations under the License.
namespace tensorflow {
REGISTER5(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
int16, int32, int64);
REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
Eigen::half, double);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
@ -25,11 +28,10 @@ REGISTER5(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::floor_div<TYPE>>);
TF_CALL_INTEGRAL_TYPES(REGISTER_SYCL_KERNEL);
REGISTER_SYCL_KERNEL(float)
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
Eigen::half, double);
#if GOOGLE_CUDA
REGISTER4(BinaryOp, GPU, "FloorDiv", functor::floor_div, uint8, uint16, int16,
int64);


@ -18,6 +18,7 @@ limitations under the License.
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
double);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
@ -25,9 +26,10 @@ REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::isfinite<TYPE>>);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half,
double);


@ -17,6 +17,7 @@ limitations under the License.
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
@ -24,9 +25,10 @@ REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double);
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::isinf<TYPE>>);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double);
#endif


@ -17,6 +17,7 @@ limitations under the License.
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "IsNan", functor::isnan, float, Eigen::half, double);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
@ -24,9 +25,10 @@ REGISTER3(UnaryOp, CPU, "IsNan", functor::isnan, float, Eigen::half, double);
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::isnan<TYPE>>);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double);
#endif


@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Log") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::log<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double);
#endif


@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, double,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Log1p") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::log1p<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double);
#endif


@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER7(UnaryOp, CPU, "Neg", functor::neg, float, Eigen::half, double, int32,
complex64, int64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Neg") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::neg<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER4(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64);


@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER7(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double, int32,
int64, complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Pow") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
BinaryOp<SYCLDevice, functor::pow<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER4(BinaryOp, GPU, "Pow", functor::pow, float, Eigen::half, double,
int64);


@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Rsqrt", functor::rsqrt, float, Eigen::half, double,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Rsqrt") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::rsqrt<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Rsqrt", functor::rsqrt, float, Eigen::half, double);
#endif


@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, double,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Sin") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::sin<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif  // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double);
#endif


@ -18,6 +18,18 @@ limitations under the License.
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Sqrt", functor::sqrt, float, Eigen::half, double,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \
REGISTER_KERNEL_BUILDER( \
Name("Sqrt") \
.Device(DEVICE_SYCL) \
.TypeConstraint<TYPE>("T"), \
UnaryOp<SYCLDevice, functor::sqrt<TYPE>>);
REGISTER_SYCL_KERNEL(float);
#undef REGISTER_SYCL_KERNEL
#endif  // TENSORFLOW_USE_SYCL
#if GOOGLE_CUDA
REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);
#endif

Some files were not shown because too many files have changed in this diff.