diff --git a/README.md b/README.md index 923b094e4a8..e0ac8f6eff9 100644 --- a/README.md +++ b/README.md @@ -33,10 +33,10 @@ and discussion.** People who are a little more adventurous can also try our nightly binaries: -* Linux CPU-only: [Python 2](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-cp27-none-linux_x86_64.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-cp34-cp34m-linux_x86_64.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](http://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-cp35-cp35m-linux_x86_64.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/)) -* Linux GPU: [Python 2](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-cp27-none-linux_x86_64.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 
3.4](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-cp34-cp34m-linux_x86_64.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/140/artifact/pip_test/whl/tensorflow-0.8.0-cp35-cp35m-linux_x86_64.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) -* Mac CPU-only: [Python 2](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-py2-none-any.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/)) / [Python 3](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-py3-none-any.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/)) -* Mac GPU: [Python 
2](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-py2-none-any.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.9.0-py3-none-any.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/)) +* Linux CPU-only: [Python 2](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](http://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp35-cp35m-linux_x86_64.whl) ([build 
history](http://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/)) +* Linux GPU: [Python 2](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/140/artifact/pip_test/whl/tensorflow-0.8.0-cp35-cp35m-linux_x86_64.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-linux-gpu/TF_BUILD_CONTAINER_TYPE=GPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) +* Mac CPU-only: [Python 2](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py2-none-any.whl) ([build 
history](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac1-slave/)) / [Python 3](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py3-none-any.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_CONTAINER_TYPE=CPU,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac1-slave/)) +* Mac GPU: [Python 2](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py2-none-any.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.10.0rc0-py3-none-any.whl) ([build history](http://ci.tensorflow.org/view/Nightly/job/nigntly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/)) * [Android](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/lastSuccessfulBuild/artifact/bazel-out/local_linux/bin/tensorflow/examples/android/tensorflow_demo.apk) ([build 
history](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/)) #### *Try your first TensorFlow program* diff --git a/RELEASE.md b/RELEASE.md index dd9558dd90b..4c9c33bf0dc 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,16 +1,40 @@ -# Changes Since Last Release -## Features and Improvements -* Connectionist Temporal Classification ops are now "official" (see, e.g., - `tf.nn.ctc_loss`) -* Preliminary graph-construction C API, for use by language bindings. -* Major revision to the graph-construction C++ API. Scoping mechanism to make op - naming, specifying control dependencies etc. more consistent. C++ values can - be used directly as operands, making op construction more concise. +# Release 0.10.0 -## Breaking Changes to the API -* `env.h` replaces use of `New*File()` functions to use `std::unique_ptr` - return arguments, removing the old raw pointer returns. +## Major Features and Improvements + +* Added support for C++ shape inference +* Added graph-construction C API +* Major revision to the graph-construction C++ API +* Support makefile build for iOS +* Added Mac GPU support +* Full version of TF-Slim available as `tf.contrib.slim` +* Added k-Means clustering and WALS matrix factorization + +## Big Fixes and Other Changes + +* Allow gradient computation for scalar values. +* Performance improvements for gRPC +* Improved support for fp16 +* New high-level ops in tf.contrib.{layers,metrics} +* New features for TensorBoard, such as shape display, exponential smoothing +* Faster and more stable Google Cloud Storage (GCS) filesystem support +* Support for zlib compression and decompression for TFRecordReader and TFRecordWriter +* Support for reading (animated) GIFs +* Improved support for SparseTensor +* Added support for more probability distributions (Dirichlet, Beta, Bernoulli, etc.) 
+* Added Python interfaces to reset resource containers. +* Many bugfixes and performance improvements +* Many documentation fixes + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +Alex Rothberg, Andrew Royer, Austin Marshall, @BlackCoal, Bob Adolf, Brian Diesel, Charles-Emmanuel Dias, @chemelnucfin, Chris Lesniewski, Daeyun Shin, Daniel Rodriguez, Danijar Hafner, Darcy Liu, Kristinn R. Thórisson, Daniel Castro, Dmitry Savintsev, Kashif Rasul, Dylan Paiton, Emmanuel T. Odeke, Ernest Grzybowski, Gavin Sherry, Gideon Dresdner, Gregory King, Harold Cooper, @heinzbeinz, Henry Saputra, Huarong Huo, Huazuo Gao, Igor Babuschkin, Igor Macedo Quintanilha, Ivan Ukhov, James Fysh, Jan Wilken Dörrie, Jihun Choi, Johnny Lim, Jonathan Raiman, Justin Francis, @lilac, Li Yi, Marc Khoury, Marco Marchesi, Max Melnick, Micael Carvalho, @mikowals, Mostafa Gazar, Nico Galoppo, Nishant Agrawal, Petr Janda, Yuncheng Li, @raix852, Robert Rose, @Robin-des-Bois, Rohit Girdhar, Sam Abrahams, satok16, Sergey Kishchenko, Sharkd Tu, @shotat, Siddharth Agrawal, Simon Denel, @sono-bfio, SunYeop Lee, Thijs Vogels, @tobegit3hub, @Undo1, Wang Yang, Wenjian Huang, Yaroslav Bulatov, Yuan Tang, Yunfeng Wang, Ziming Dong + +We are also grateful to all who filed issues or helped resolve them, asked and +answered questions, and were part of inspiring discussions. 
# Release 0.9.0 @@ -55,7 +79,7 @@ This release contains contributions from many people at Google, as well as: -Aaron Schumacher, Aidan Dang, Akihiko ITOH, Aki Sukegawa, Arbit Chen, Aziz Alto, Danijar Hafner, Erik Erwitt, Fabrizio Milo, Felix Maximilian Möller, Henry Saputra, Sung Kim, Igor Babuschkin, Jan Zikes, Jeremy Barnes, Jesper Steen Møller, Johannes Mayer, Justin Harris, Kashif Rasul, Kevin Robinson, Loo Rong Jie, Lucas Moura, Łukasz Bieniasz-Krzywiec, Mario Cho, Maxim Grechkin, Michael Heilman, Mostafa Rahmani, Mourad Mourafiq, @ninotoshi, Orion Reblitz-Richardson, Yuncheng Li, @raoqiyu, Robert DiPietro, Sam Abrahams, Sebastian Raschka, Siddharth Agrawal, @snakecharmer1024, Stephen Roller, Sung Kim, SunYeop Lee, Thijs Vogels, Till Hoffmann, Victor Melo, Ville Kallioniemi, Waleed Abdulla, Wenjian Huang, Yaroslav Bulatov, Yeison Rodriguez, Yuan (Terry) Tang, Yuxin Wu, @zhongzyd, Ziming Dong, Zohar Jackson +Aaron Schumacher, Aidan Dang, Akihiko ITOH, Aki Sukegawa, Arbit Chen, Aziz Alto, Danijar Hafner, Erik Erwitt, Fabrizio Milo, Felix Maximilian Möller, Henry Saputra, Sung Kim, Igor Babuschkin, Jan Zikes, Jeremy Barnes, Jesper Steen Møller, Johannes Mayer, Justin Harris, Kashif Rasul, Kevin Robinson, Loo Rong Jie, Lucas Moura, Łukasz Bieniasz-Krzywiec, Mario Cho, Maxim Grechkin, Michael Heilman, Mostafa Rahmani, Mourad Mourafiq, @ninotoshi, Orion Reblitz-Richardson, Yuncheng Li, @raoqiyu, Robert DiPietro, Sam Abrahams, Sebastian Raschka, Siddharth Agrawal, @snakecharmer1024, Stephen Roller, Sung Kim, SunYeop Lee, Thijs Vogels, Till Hoffmann, Victor Melo, Ville Kallioniemi, Waleed Abdulla, Wenjian Huang, Yaroslav Bulatov, Yeison Rodriguez, Yuan Tang, Yuxin Wu, @zhongzyd, Ziming Dong, Zohar Jackson We are also grateful to all who filed issues or helped resolve them, asked and answered questions, and were part of inspiring discussions. @@ -97,7 +121,7 @@ answered questions, and were part of inspiring discussions. 
This release contains contributions from many people at Google, as well as: -Abhinav Upadhyay, Aggelos Avgerinos, Alan Wu, Alexander G. de G. Matthews, Aleksandr Yahnev, @amchercashin, Andy Kitchen, Aurelien Geron, Awni Hannun, @BanditCat, Bas Veeling, Cameron Chen, @cg31, Cheng-Lung Sung, Christopher Bonnett, Dan Becker, Dan Van Boxel, Daniel Golden, Danijar Hafner, Danny Goodman, Dave Decker, David Dao, David Kretch, Dongjoon Hyun, Dustin Dorroh, @e-lin, Eurico Doirado, Erik Erwitt, Fabrizio Milo, @gaohuazuo, Iblis Lin, Igor Babuschkin, Isaac Hodes, Isaac Turner, Iván Vallés, J Yegerlehner, Jack Zhang, James Wexler, Jan Zikes, Jay Young, Jeff Hodges, @jmtatsch, Johnny Lim, Jonas Meinertz Hansen, Kanit Wongsuphasawat, Kashif Rasul, Ken Shirriff, Kenneth Mitchner, Kenta Yonekura, Konrad Magnusson, Konstantin Lopuhin, @lahwran, @lekaha, @liyongsea, Lucas Adams, @makseq, Mandeep Singh, @manipopopo, Mark Amery, Memo Akten, Michael Heilman, Michael Peteuil, Nathan Daly, Nicolas Fauchereau, @ninotoshi, Olav Nymoen, @panmari, @papelita1234, Pedro Lopes, Pranav Sailesh Mani, RJ Ryan, Rob Culliton, Robert DiPietro, @ronrest, Sam Abrahams, Sarath Shekkizhar, Scott Graham, Sebastian Raschka, Sung Kim, Surya Bhupatiraju, Syed Ahmed, Till Hoffmann, @timsl, @urimend, @vesnica, Vlad Frolov, Vlad Zagorodniy, Wei-Ting Kuo, Wenjian Huang, William Dmitri Breaden Madden, Wladimir Schmidt, Yuwen Yan, Yuxin Wu, Yuya Kusakabe, @zhongzyd, @znah. +Abhinav Upadhyay, Aggelos Avgerinos, Alan Wu, Alexander G. de G. 
Matthews, Aleksandr Yahnev, @amchercashin, Andy Kitchen, Aurelien Geron, Awni Hannun, @BanditCat, Bas Veeling, Cameron Chen, @cg31, Cheng-Lung Sung, Christopher Bonnett, Dan Becker, Dan Van Boxel, Daniel Golden, Danijar Hafner, Danny Goodman, Dave Decker, David Dao, David Kretch, Dongjoon Hyun, Dustin Dorroh, @e-lin, Eurico Doirado, Erik Erwitt, Fabrizio Milo, @gaohuazuo, Iblis Lin, Igor Babuschkin, Isaac Hodes, Isaac Turner, Iván Vallés, J Yegerlehner, Jack Zhang, James Wexler, Jan Zikes, Jay Young, Jeff Hodges, @jmtatsch, Johnny Lim, Jonas Meinertz Hansen, Kanit Wongsuphasawat, Kashif Rasul, Ken Shirriff, Kenneth Mitchner, Kenta Yonekura, Konrad Magnusson, Konstantin Lopuhin, @lahwran, @lekaha, @liyongsea, Lucas Adams, @makseq, Mandeep Singh, @manipopopo, Mark Amery, Memo Akten, Michael Heilman, Michael Peteuil, Nathan Daly, Nicolas Fauchereau, @ninotoshi, Olav Nymoen, @panmari, @papelita1234, Pedro Lopes, Pranav Sailesh Mani, RJ Ryan, Rob Culliton, Robert DiPietro, @ronrest, Sam Abrahams, Sarath Shekkizhar, Scott Graham, Sebastian Raschka, Sung Kim, Surya Bhupatiraju, Syed Ahmed, Till Hoffmann, @timsl, @urimend, @vesnica, Vlad Frolov, Vlad Zagorodniy, Wei-Ting Kuo, Wenjian Huang, William Dmitri Breaden Madden, Wladimir Schmidt, Yuan Tang, Yuwen Yan, Yuxin Wu, Yuya Kusakabe, @zhongzyd, @znah. We are also grateful to all who filed issues or helped resolve them, asked and answered questions, and were part of inspiring discussions. 
diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 9bdb702f716..7f80bf94132 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -37,7 +37,10 @@ config_setting( package_group( name = "internal", - packages = ["//tensorflow/..."], + packages = [ + "//learning/vis/...", + "//tensorflow/...", + ], ) sh_binary( @@ -71,6 +74,7 @@ filegroup( name = "all_opensource_files", data = [ ":all_files", + "//tensorflow/c:all_files", "//tensorflow/cc:all_files", "//tensorflow/contrib:all_files", "//tensorflow/contrib/copy_graph:all_files", @@ -103,6 +107,7 @@ filegroup( "//tensorflow/contrib/testing:all_files", "//tensorflow/contrib/util:all_files", "//tensorflow/core:all_files", + "//tensorflow/core/debug:all_files", "//tensorflow/core/distributed_runtime:all_files", "//tensorflow/core/distributed_runtime/rpc:all_files", "//tensorflow/core/kernels:all_files", diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD new file mode 100644 index 00000000000..fec8ca759ec --- /dev/null +++ b/tensorflow/c/BUILD @@ -0,0 +1,95 @@ +# Description: +# C API for TensorFlow, for use by client language bindings. 
+ +licenses(["notice"]) # Apache 2.0 + +load( + "//tensorflow:tensorflow.bzl", + "tf_cc_test", + "tf_cuda_library", +) + +# For platform specific build config +load( + "//tensorflow/core:platform/default/build_config.bzl", + "tf_kernel_tests_linkstatic", +) + +# ----------------------------------------------------------------------------- +# Public targets + +tf_cuda_library( + name = "c_api", + srcs = ["c_api.cc"], + hdrs = ["c_api.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + ], +) + +tf_cuda_library( + name = "tf_status_helper", + srcs = ["tf_status_helper.cc"], + hdrs = ["tf_status_helper.h"], + visibility = ["//visibility:public"], + deps = [ + ":c_api", + "//tensorflow/core:lib", + ], +) + +tf_cuda_library( + name = "checkpoint_reader", + srcs = ["checkpoint_reader.cc"], + hdrs = ["checkpoint_reader.h"], + visibility = ["//visibility:public"], + deps = [ + ":tf_status_helper", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + ], +) + +# ----------------------------------------------------------------------------- +# Tests + +tf_cc_test( + name = "c_api_test", + size = "small", + linkopts = select({ + "//tensorflow:darwin": ["-headerpad_max_install_names"], + "//conditions:default": [], + }), + linkstatic = tf_kernel_tests_linkstatic(), + deps = [ + ":c_api", + "//tensorflow/core:direct_session", + "//tensorflow/core:framework", + "//tensorflow/core:proto_text", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/kernels:array", + "//tensorflow/core/kernels:math", + "//third_party/eigen3", + ], +) + +# ----------------------------------------------------------------------------- +# Google-internal targets. 
+ +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/core/client/tensor_c_api.cc b/tensorflow/c/c_api.cc similarity index 99% rename from tensorflow/core/client/tensor_c_api.cc rename to tensorflow/c/c_api.cc index 99e5d796817..58557e9ba29 100644 --- a/tensorflow/core/client/tensor_c_api.cc +++ b/tensorflow/c/c_api.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/public/tensor_c_api.h" +#include "tensorflow/c/c_api.h" #include #include @@ -482,7 +482,6 @@ static void TF_Run_Helper( result = session->PRun(handle, input_pairs, output_tensor_names, &outputs); } if (!result.ok()) { - LOG(ERROR) << result.error_message(); status->status = result; return; } diff --git a/tensorflow/core/public/tensor_c_api.h b/tensorflow/c/c_api.h similarity index 99% rename from tensorflow/core/public/tensor_c_api.h rename to tensorflow/c/c_api.h index 9f4f7adde91..9d0b979bb94 100644 --- a/tensorflow/core/public/tensor_c_api.h +++ b/tensorflow/c/c_api.h @@ -13,9 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -// TODO(jeff,sanjay): Rename to tensorflow/public/c_api.h -#ifndef TENSORFLOW_PUBLIC_TENSOR_C_API_H_ -#define TENSORFLOW_PUBLIC_TENSOR_C_API_H_ +#ifndef TENSORFLOW_C_C_API_H_ +#define TENSORFLOW_C_C_API_H_ #include #include @@ -699,4 +698,4 @@ extern TF_Buffer TF_GetOpList(TF_Library* lib_handle); } /* end extern "C" */ #endif -#endif // TENSORFLOW_PUBLIC_TENSOR_C_API_H_ +#endif // TENSORFLOW_C_C_API_H_ diff --git a/tensorflow/core/client/tensor_c_api_test.cc b/tensorflow/c/c_api_test.cc similarity index 99% rename from tensorflow/core/client/tensor_c_api_test.cc rename to tensorflow/c/c_api_test.cc index 0bbc22495aa..23963caba7e 100644 --- a/tensorflow/core/client/tensor_c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/public/tensor_c_api.h" +#include "tensorflow/c/c_api.h" #include #include "tensorflow/core/framework/graph.pb_text.h" diff --git a/tensorflow/core/util/checkpoint_reader.cc b/tensorflow/c/checkpoint_reader.cc similarity index 97% rename from tensorflow/core/util/checkpoint_reader.cc rename to tensorflow/c/checkpoint_reader.cc index ba252ecc926..dd9cb225598 100644 --- a/tensorflow/core/util/checkpoint_reader.cc +++ b/tensorflow/c/checkpoint_reader.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/util/checkpoint_reader.h" +#include "tensorflow/c/checkpoint_reader.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/platform/env.h" diff --git a/tensorflow/core/util/checkpoint_reader.h b/tensorflow/c/checkpoint_reader.h similarity index 90% rename from tensorflow/core/util/checkpoint_reader.h rename to tensorflow/c/checkpoint_reader.h index 65d1949ef49..fb06d6d8640 100644 --- a/tensorflow/core/util/checkpoint_reader.h +++ b/tensorflow/c/checkpoint_reader.h @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_CORE_UTIL_CHECKPOINT_READER_H -#define TENSORFLOW_CORE_UTIL_CHECKPOINT_READER_H +#ifndef TENSORFLOW_C_CHECKPOINT_READER_H +#define TENSORFLOW_C_CHECKPOINT_READER_H +#include "tensorflow/c/tf_status_helper.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/tensor_slice_reader.h" -#include "tensorflow/core/util/tf_status_helper.h" namespace tensorflow { @@ -60,4 +60,4 @@ class CheckpointReader { } // namespace checkpoint } // namespace tensorflow -#endif // TENSORFLOW_CORE_UTIL_CHECKPOINT_READER_H +#endif // TENSORFLOW_C_CHECKPOINT_READER_H diff --git a/tensorflow/core/util/tf_status_helper.cc b/tensorflow/c/tf_status_helper.cc similarity index 98% rename from tensorflow/core/util/tf_status_helper.cc rename to tensorflow/c/tf_status_helper.cc index d119b9845cf..747fd672f08 100644 --- a/tensorflow/core/util/tf_status_helper.cc +++ b/tensorflow/c/tf_status_helper.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/util/tf_status_helper.h" +#include "tensorflow/c/tf_status_helper.h" namespace tensorflow { diff --git a/tensorflow/core/util/tf_status_helper.h b/tensorflow/c/tf_status_helper.h similarity index 82% rename from tensorflow/core/util/tf_status_helper.h rename to tensorflow/c/tf_status_helper.h index b3cea3072c4..4bc56f9cb40 100644 --- a/tensorflow/core/util/tf_status_helper.h +++ b/tensorflow/c/tf_status_helper.h @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_CORE_UTIL_TF_STATUS_HELPER_H -#define TENSORFLOW_CORE_UTIL_TF_STATUS_HELPER_H +#ifndef TENSORFLOW_C_TF_STATUS_HELPER_H +#define TENSORFLOW_C_TF_STATUS_HELPER_H +#include "tensorflow/c/c_api.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/public/tensor_c_api.h" namespace tensorflow { @@ -26,4 +26,4 @@ void Set_TF_Status_from_Status(TF_Status* tf_status, const Status& status); } // namespace tensorflow -#endif // TENSORFLOW_CORE_UTIL_TF_STATUS_HELPER_H +#endif // TENSORFLOW_C_TF_STATUS_HELPER_H diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 7c347c6cf67..8e6c3d7e62b 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -73,6 +73,46 @@ tf_cc_test( ], ) +cc_library( + name = "grad_op_registry", + srcs = ["framework/grad_op_registry.cc"], + hdrs = ["framework/grad_op_registry.h"], + deps = [ + ":ops", + ":scope", + ], +) + +cc_library( + name = "math_grad", + srcs = ["gradients/math_grad.cc"], + deps = [ + ":cc_ops", + ":grad_op_registry", + ":ops", + ":scope", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + ], +) + +tf_cc_test( + name = "gradients/math_grad_test", + deps = [ + ":cc_ops", + ":grad_op_registry", + ":math_grad", + "//tensorflow/core:all_kernels", + 
"//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib_internal", + "//tensorflow/core:tensorflow", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + tf_gen_op_wrappers_cc( name = "cc_ops", op_lib_names = [ diff --git a/tensorflow/cc/framework/grad_op_registry.cc b/tensorflow/cc/framework/grad_op_registry.cc new file mode 100644 index 00000000000..b83e7de61c6 --- /dev/null +++ b/tensorflow/cc/framework/grad_op_registry.cc @@ -0,0 +1,42 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/cc/framework/grad_op_registry.h" + +namespace tensorflow { +namespace ops { + +// static +GradOpRegistry* GradOpRegistry::Global() { + static GradOpRegistry* grad_op_registry = new GradOpRegistry; + return grad_op_registry; +} + +bool GradOpRegistry::Register(const string& op, GradFunc func) { + CHECK(registry_.insert({op, func}).second) << "Existing gradient for " << op; + return true; +} + +Status GradOpRegistry::Lookup(const string& op, GradFunc* func) { + auto iter = registry_.find(op); + if (iter == registry_.end()) { + return errors::NotFound("No gradient defined for op: ", op); + } + *func = iter->second; + return Status::OK(); +} + +} // end namespace ops +} // namespace tensorflow diff --git a/tensorflow/cc/framework/grad_op_registry.h b/tensorflow/cc/framework/grad_op_registry.h new file mode 100644 index 00000000000..b8a15219e52 --- /dev/null +++ b/tensorflow/cc/framework/grad_op_registry.h @@ -0,0 +1,75 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_ +#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_ + +#include + +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" + +namespace tensorflow { +namespace ops { + +// GradFunc is the signature for all gradient functions in GradOpRegistry. +// Implementations should add operations to compute the gradient outputs of 'op' +// (returned in 'grad_outputs') using 'scope' and 'grad_inputs'. +typedef Status (*GradFunc)(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs); + +// GradOpRegistry maintains a static registry of gradient functions. +// Gradient functions are indexed in the registry by the forward op name (i.e. +// "MatMul" -> MatMulGrad func). +class GradOpRegistry { + public: + // Registers 'func' as the the gradient function for 'op'. + // Returns true if registration was succesful, check fails otherwise. + bool Register(const string& op, GradFunc func); + + // Sets 'func' to the gradient function for 'op' and returns Status OK if + // the gradient function for 'op' exists in the registry. + // Note that 'func' can be null for ops that have registered no-gradient with + // the registry. + // Returns error status otherwise. + Status Lookup(const string& op, GradFunc* func); + + // Returns a pointer to the global gradient function registry. + static GradOpRegistry* Global(); + + private: + std::unordered_map registry_; +}; + +} // namespace ops + +// Macros used to define gradient functions for ops. 
+#define REGISTER_GRADIENT_OP(name, fn) \ + REGISTER_GRADIENT_OP_UNIQ_HELPER(__COUNTER__, name, fn) + +#define REGISTER_NO_GRADIENT_OP(name) \ + REGISTER_GRADIENT_OP_UNIQ_HELPER(__COUNTER__, name, nullptr) + +#define REGISTER_GRADIENT_OP_UNIQ_HELPER(ctr, name, fn) \ + REGISTER_GRADIENT_OP_UNIQ(ctr, name, fn) + +#define REGISTER_GRADIENT_OP_UNIQ(ctr, name, fn) \ + static bool unused_ret_val_##ctr = \ + ::tensorflow::ops::GradOpRegistry::Global()->Register(name, fn) + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_ diff --git a/tensorflow/cc/framework/ops.cc b/tensorflow/cc/framework/ops.cc index dd107a57fd0..52ef39f5070 100644 --- a/tensorflow/cc/framework/ops.cc +++ b/tensorflow/cc/framework/ops.cc @@ -18,6 +18,44 @@ limitations under the License. namespace tensorflow { namespace ops { +Operation::Operation(Node* n) : inputs_(GetInputs(n)), node_(n) {} + +Output Operation::input(int i) const { + CHECK_NOTNULL(node_); + CHECK_GE(i, 0); + CHECK_LT(i, node_->num_inputs()); + // Handle the case where the input was unknown at the time this + // Operation was constructed. 
+ if (inputs_[i].first == nullptr && inputs_[i].second == -1) { + for (const Edge* e : node_->in_edges()) { + if (e->IsControlEdge()) continue; + if (e->dst_input() == i) { + return Output(e->src(), e->src_output()); + } + } + } + return Output(inputs_[i].first, inputs_[i].second); +} + +Output Operation::output(int i) const { + CHECK_NOTNULL(node_); + CHECK_GE(i, 0); + CHECK_LT(i, node_->num_outputs()); + return Output(node_, i); +} + +Operation::Inputs Operation::GetInputs(Node* node) { + Operation::Inputs inputs; + if (node != nullptr) { + inputs.resize(node->num_inputs(), {nullptr, -1}); + for (const Edge* e : node->in_edges()) { + if (e->IsControlEdge()) continue; + inputs[e->dst_input()] = std::make_pair(e->src(), e->src_output()); + } + } + return inputs; +} + Input::Initializer::Initializer( const std::initializer_list& v) { if (v.size() < 1) { diff --git a/tensorflow/cc/framework/ops.h b/tensorflow/cc/framework/ops.h index 1737f043cb0..517598d9e86 100644 --- a/tensorflow/cc/framework/ops.h +++ b/tensorflow/cc/framework/ops.h @@ -27,17 +27,29 @@ limitations under the License. namespace tensorflow { namespace ops { +class Output; + // Represents a node in the computation graph. 
class Operation { public: Operation() : node_(nullptr) {} - explicit Operation(Node* n) : node_(n) {} + explicit Operation(Node* n); + + int num_inputs() const { return node_->num_inputs(); } + DataType input_type(int o) const { return node_->input_type(o); } + Output input(int i) const; int num_outputs() const { return node_->num_outputs(); } DataType output_type(int o) const { return node_->output_type(o); } + Output output(int i) const; + Node* node() const { return node_; } private: + typedef std::vector> Inputs; + static Inputs GetInputs(Node* node); + + Inputs inputs_; Node* node_; }; @@ -81,7 +93,7 @@ class Input { tensor = t; } - explicit Initializer(const Tensor& t) : tensor(t) {} + Initializer(const Tensor& t) : tensor(t) {} // NOLINT(runtime/explicit) // Construct from a scalar value and an explicit shape template * grad_outputs) { + auto dx = + MatMul(scope, x0, x1, MatMul::TransposeA(adj_x0).TransposeB(adj_x1)); + grad_outputs->push_back(dx); + auto dy = + MatMul(scope, y0, y1, MatMul::TransposeA(adj_y0).TransposeB(adj_y1)); + grad_outputs->push_back(dy); + return Status::OK(); +} + +// MatMulGrad common used to read and check node attr state, and determine +// proper MatMul products for gradients based on input matrix transposition +// combinations. +// TODO(andydavis) Re-use this function for BatchMatMulGrad. 
+Status MatMulGradCommon(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + const string& attr_adj_x, const string& attr_adj_y, + std::vector* grad_outputs) { + DataType dtype; + TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), "T", &dtype)); + if (dtype == DT_COMPLEX64 || dtype == DT_COMPLEX128) { + return errors::Unimplemented( + "MatMul gradient for complex data type is not supported yet."); + } + + bool ta; + bool tb; + TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), attr_adj_x, &ta)); + TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), attr_adj_y, &tb)); + + if (!ta && !tb) { + return MatMulGradHelper(scope, grad_inputs[0], false, op.input(1), true, + op.input(0), true, grad_inputs[0], false, + grad_outputs); + } else if (!ta && tb) { + return MatMulGradHelper(scope, grad_inputs[0], false, op.input(1), false, + grad_inputs[0], true, op.input(0), false, + grad_outputs); + } else if (ta && !tb) { + return MatMulGradHelper(scope, op.input(1), false, grad_inputs[0], true, + op.input(0), false, grad_inputs[0], false, + grad_outputs); + } + return MatMulGradHelper(scope, op.input(1), true, grad_inputs[0], true, + grad_inputs[0], true, op.input(0), true, + grad_outputs); +} + +Status MatMulGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + return MatMulGradCommon(scope, op, grad_inputs, "transpose_a", "transpose_b", + grad_outputs); +} + +REGISTER_GRADIENT_OP("MatMul", MatMulGrad); + +} // anonymous namespace +} // namespace ops +} // namespace tensorflow diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc new file mode 100644 index 00000000000..993316d7628 --- /dev/null +++ b/tensorflow/cc/gradients/math_grad_test.cc @@ -0,0 +1,183 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/framework/grad_op_registry.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/graph/default_device.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +using namespace ops; // NOLINT(build/namespaces) + +namespace { + +// TODO(andydavis) Test gradient function against numeric gradients output. +// TODO(andydavis) As more gradients are added move common test functions +// to a testutil library. +class MathGradTest : public ::testing::Test { + protected: + MathGradTest() : root_(Scope::NewRootScope()) {} + + void ComputeMatMulGrad(const Output& x, const bool t_x, const Output& y, + const bool t_y, const Output& dz, + std::vector* out) { + // Compute forward MatMul: z = MatMul(x, y). + auto z = MatMul(root_, x, y, MatMul::TransposeA(t_x).TransposeB(t_y)); + TF_EXPECT_OK(root_.status()); + CHECK_NOTNULL(z.node()); + std::vector grad_outputs; + // Call MatMulGrad which populates 'grad_outputs'. 
+ CallGradFunction(Operation(z.node()), {dz}, &grad_outputs); + EXPECT_EQ(2, grad_outputs.size()); + // Run graph and return MatMul gradient tensors for 'dx' and 'dy' in 'out'. + GetTensors(root_, {grad_outputs[0], grad_outputs[1]}, out); + } + + void CallGradFunction(const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + GradFunc grad_fn; + TF_EXPECT_OK(GradOpRegistry::Global()->Lookup(op.node()->name(), &grad_fn)); + TF_EXPECT_OK(grad_fn(root_, op, grad_inputs, grad_outputs)); + TF_EXPECT_OK(root_.status()); + } + + Tensor ComputeMatMul(const Output& x, const bool t_x, const Output& y, + const bool t_y) { + auto z = MatMul(root_, x, y, MatMul::TransposeA(t_x).TransposeB(t_y)); + TF_EXPECT_OK(root_.status()); + Tensor out; + GetTensor(root_, z, &out); + return out; + } + + void RandMatMulGradData(const bool tx, const bool ty, + std::vector* data) { + // z = MatMul(x, y) + const int m = Rand(); + const int k = Rand(); + const int n = Rand(); + // x.shape = [m, k] + const TensorShape x_shape = tx ? TensorShape({k, m}) : TensorShape({m, k}); + data->emplace_back(DT_FLOAT, x_shape); + RandTensor(&data->back()); + // y.shape = [k, n] + const TensorShape y_shape = ty ? TensorShape({n, k}) : TensorShape({k, n}); + data->emplace_back(DT_FLOAT, y_shape); + RandTensor(&data->back()); + // z.shape = [m, n] + data->emplace_back(DT_FLOAT, TensorShape({m, n})); + RandTensor(&data->back()); + } + + void RandTensor(Tensor* t) { + test::FillFn( + t, [this](const int i) { return static_cast(Rand()); }); + } + + int Rand() { return 1 + (random::New64() % 10); } + + // TODO(andydavis) Move 'GetTensors/GetTensor' to some testutil class. + // Note: they should be moved to a general/non-grad specific testutil class. 
+ void GetTensors(const Scope& scope, OutputList tensors, + std::vector* out) { + SessionOptions options; + std::unique_ptr session(NewSession(options)); + GraphDef def; + scope.graph()->ToGraphDef(&def); + + graph::SetDefaultDevice("/cpu:0", &def); + + TF_CHECK_OK(session->Create(def)); + std::vector names; + for (const auto& t : tensors) { + names.push_back(strings::StrCat(t.node()->name(), ":", t.index())); + } + TF_CHECK_OK(session->Run({}, names, {}, out)); + TF_CHECK_OK(session->Close()); + } + + void GetTensor(const Scope& scope, Output tensor, Tensor* out) { + std::vector outputs; + GetTensors(scope, {tensor}, &outputs); + *out = outputs[0]; + } + + Scope root_; +}; + +TEST_F(MathGradTest, MatMulGrad_NoTranspose) { + std::vector data; + RandMatMulGradData(false, false, &data); + auto x = Const(root_, data[0]); + auto y = Const(root_, data[1]); + auto dz = Const(root_, data[2]); + + std::vector grad_outputs; + ComputeMatMulGrad(x, false, y, false, dz, &grad_outputs); + + test::ExpectClose(grad_outputs[0], ComputeMatMul(dz, false, y, true)); + test::ExpectClose(grad_outputs[1], ComputeMatMul(x, true, dz, false)); +} + +TEST_F(MathGradTest, MatMulGrad_TransposeX) { + std::vector data; + RandMatMulGradData(true, false, &data); + auto x = Const(root_, data[0]); + auto y = Const(root_, data[1]); + auto dz = Const(root_, data[2]); + + std::vector grad_outputs; + ComputeMatMulGrad(x, true, y, false, dz, &grad_outputs); + + test::ExpectClose(grad_outputs[0], ComputeMatMul(y, false, dz, true)); + test::ExpectClose(grad_outputs[1], ComputeMatMul(x, false, dz, false)); +} + +TEST_F(MathGradTest, MatMulGrad_TransposeY) { + std::vector data; + RandMatMulGradData(false, true, &data); + auto x = Const(root_, data[0]); + auto y = Const(root_, data[1]); + auto dz = Const(root_, data[2]); + + std::vector grad_outputs; + ComputeMatMulGrad(x, false, y, true, dz, &grad_outputs); + + test::ExpectClose(grad_outputs[0], ComputeMatMul(dz, false, y, false)); + 
test::ExpectClose(grad_outputs[1], ComputeMatMul(dz, true, x, false)); +} + +TEST_F(MathGradTest, MatMulGrad_TransposeX_TransposeY) { + std::vector data; + RandMatMulGradData(true, true, &data); + auto x = Const(root_, data[0]); + auto y = Const(root_, data[1]); + auto dz = Const(root_, data[2]); + + std::vector grad_outputs; + ComputeMatMulGrad(x, true, y, true, dz, &grad_outputs); + + test::ExpectClose(grad_outputs[0], ComputeMatMul(y, true, dz, true)); + test::ExpectClose(grad_outputs[1], ComputeMatMul(dz, true, x, true)); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index c449c803fa2..e5a0790ff4a 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -39,6 +39,7 @@ set (DOWNLOAD_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/downloads" mark_as_advanced(DOWNLOAD_LOCATION) # External dependencies +include(gif) include(png) include(jpeg) include(re2) diff --git a/tensorflow/contrib/cmake/external/gif.cmake b/tensorflow/contrib/cmake/external/gif.cmake new file mode 100644 index 00000000000..9fdaeec13f6 --- /dev/null +++ b/tensorflow/contrib/cmake/external/gif.cmake @@ -0,0 +1,38 @@ +include (ExternalProject) + +set(gif_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/gif_archive) +set(gif_URL http://ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz) +set(gif_HASH SHA256=34a7377ba834397db019e8eb122e551a49c98f49df75ec3fcc92b9a794a4f6d1) +set(gif_INSTALL ${CMAKE_BINARY_DIR}/gif/install) +set(gif_STATIC_LIBRARIES ${gif_INSTALL}/lib/libgif.a) + +set(gif_HEADERS + "${gif_INSTALL}/include/gif_lib.h" +) + +ExternalProject_Add(gif + PREFIX gif + URL ${gif_URL} + URL_HASH ${gif_HASH} + INSTALL_DIR ${gif_INSTALL} + DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" + BUILD_COMMAND $(MAKE) + INSTALL_COMMAND $(MAKE) install + CONFIGURE_COMMAND + ${CMAKE_CURRENT_BINARY_DIR}/gif/src/gif/configure + --prefix=${gif_INSTALL} + --enable-shared=yes +) + +# 
put gif includes in the directory where they are expected +add_custom_target(gif_create_destination_dir + COMMAND ${CMAKE_COMMAND} -E make_directory ${gif_INCLUDE_DIR}/giflib-5.1.4/lib + DEPENDS gif) + +add_custom_target(gif_copy_headers_to_destination + DEPENDS gif_create_destination_dir) + +foreach(header_file ${gif_HEADERS}) + add_custom_command(TARGET gif_copy_headers_to_destination PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${header_file} ${gif_INCLUDE_DIR}/giflib-5.1.4/lib/) +endforeach() diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake index 6ea9860992c..42eeef39e2d 100644 --- a/tensorflow/contrib/cmake/tf_cc_ops.cmake +++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake @@ -1,10 +1,39 @@ +######################################################## +# tf_cc_framework library +######################################################## +set(tf_cc_framework_srcs + "${tensorflow_source_dir}/tensorflow/cc/framework/ops.h" + "${tensorflow_source_dir}/tensorflow/cc/framework/ops.cc" + "${tensorflow_source_dir}/tensorflow/cc/framework/scope.h" + "${tensorflow_source_dir}/tensorflow/cc/framework/scope.cc" +) + +add_library(tf_cc_framework OBJECT ${tf_cc_framework_srcs}) + +add_dependencies(tf_cc_framework tf_core_framework) + +target_include_directories(tf_cc_framework PRIVATE + ${tensorflow_source_dir} + ${eigen_INCLUDE_DIRS} +) + +target_compile_options(tf_cc_framework PRIVATE + -fno-exceptions + -DEIGEN_AVOID_STL_ARRAY +) + +# C++11 +target_compile_features(tf_cc_framework PRIVATE + cxx_rvalue_references +) + ######################################################## # tf_cc_op_gen_main library ######################################################## set(tf_cc_op_gen_main_srcs - "${tensorflow_source_dir}/tensorflow/cc/ops/cc_op_gen.cc" - "${tensorflow_source_dir}/tensorflow/cc/ops/cc_op_gen_main.cc" - "${tensorflow_source_dir}/tensorflow/cc/ops/cc_op_gen.h" + "${tensorflow_source_dir}/tensorflow/cc/framework/cc_op_gen.cc" + 
"${tensorflow_source_dir}/tensorflow/cc/framework/cc_op_gen_main.cc" + "${tensorflow_source_dir}/tensorflow/cc/framework/cc_op_gen.h" ) add_library(tf_cc_op_gen_main OBJECT ${tf_cc_op_gen_main_srcs}) @@ -120,6 +149,7 @@ foreach(tf_cc_op_lib_name ${tf_cc_op_lib_names}) ${PROTOBUF_LIBRARIES} tf_protos_cc re2_lib + ${gif_STATIC_LIBRARIES} ${jpeg_STATIC_LIBRARIES} ${png_STATIC_LIBRARIES} ${ZLIB_LIBRARIES} diff --git a/tensorflow/contrib/cmake/tf_core_direct_session.cmake b/tensorflow/contrib/cmake/tf_core_direct_session.cmake index bafc7e1e630..ba274d7f798 100644 --- a/tensorflow/contrib/cmake/tf_core_direct_session.cmake +++ b/tensorflow/contrib/cmake/tf_core_direct_session.cmake @@ -4,8 +4,17 @@ file(GLOB tf_core_direct_session_srcs "${tensorflow_source_dir}/tensorflow/core/common_runtime/direct_session.cc" "${tensorflow_source_dir}/tensorflow/core/common_runtime/direct_session.h" + "${tensorflow_source_dir}/tensorflow/core/debug/*.h" + "${tensorflow_source_dir}/tensorflow/core/debug/*.cc" ) +file(GLOB_RECURSE tf_core_direct_session_test_srcs + "${tensorflow_source_dir}/tensorflow/core/debug/*test*.h" + "${tensorflow_source_dir}/tensorflow/core/debug/*test*.cc" +) + +list(REMOVE_ITEM tf_core_direct_session_srcs ${tf_core_direct_session_test_srcs}) + add_library(tf_core_direct_session OBJECT ${tf_core_direct_session_srcs}) add_dependencies(tf_core_direct_session tf_core_cpu) diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake index 3e6ec3c389e..c4828823556 100644 --- a/tensorflow/contrib/cmake/tf_core_framework.cmake +++ b/tensorflow/contrib/cmake/tf_core_framework.cmake @@ -150,6 +150,7 @@ list(REMOVE_ITEM tf_core_lib_srcs ${tf_core_lib_test_srcs}) add_library(tf_core_lib OBJECT ${tf_core_lib_srcs}) target_include_directories(tf_core_lib PUBLIC ${tensorflow_source_dir} + ${gif_INCLUDE_DIR} ${jpeg_INCLUDE_DIR} ${png_INCLUDE_DIR} ${eigen_INCLUDE_DIRS} @@ -168,6 +169,7 @@ target_compile_features(tf_core_lib 
PRIVATE ) add_dependencies(tf_core_lib + gif_copy_headers_to_destination jpeg_copy_headers_to_destination png_copy_headers_to_destination re2_copy_headers_to_destination diff --git a/tensorflow/contrib/cmake/tf_models.cmake b/tensorflow/contrib/cmake/tf_models.cmake index ff3f5afbbaa..940492771a6 100644 --- a/tensorflow/contrib/cmake/tf_models.cmake +++ b/tensorflow/contrib/cmake/tf_models.cmake @@ -71,7 +71,7 @@ target_include_directories(tf_models_word2vec_kernels PRIVATE ${re2_INCLUDES} ) -add_dependencies(tf_models_word2vec_ops +add_dependencies(tf_models_word2vec_kernels tf_core_cpu ) diff --git a/tensorflow/contrib/cmake/tf_tools.cmake b/tensorflow/contrib/cmake/tf_tools.cmake index 3c1fe5eac75..5022dfee0d6 100644 --- a/tensorflow/contrib/cmake/tf_tools.cmake +++ b/tensorflow/contrib/cmake/tf_tools.cmake @@ -22,6 +22,7 @@ target_link_libraries(${proto_text} PUBLIC ${PROTOBUF_LIBRARIES} # tf_protos_cc # re2_lib + ${gif_STATIC_LIBRARIES} ${jpeg_STATIC_LIBRARIES} ${png_STATIC_LIBRARIES} ${ZLIB_LIBRARIES} diff --git a/tensorflow/contrib/cmake/tf_tutorials.cmake b/tensorflow/contrib/cmake/tf_tutorials.cmake index 11dfd4739b4..7f18491d3fc 100644 --- a/tensorflow/contrib/cmake/tf_tutorials.cmake +++ b/tensorflow/contrib/cmake/tf_tutorials.cmake @@ -23,6 +23,7 @@ add_executable(tf_tutorials_example_trainer $ $ $ + $ $ $ $ @@ -40,6 +41,7 @@ target_link_libraries(tf_tutorials_example_trainer PUBLIC re2_lib ${boringssl_STATIC_LIBRARIES} ${farmhash_STATIC_LIBRARIES} + ${gif_STATIC_LIBRARIES} ${jpeg_STATIC_LIBRARIES} ${jsoncpp_STATIC_LIBRARIES} ${png_STATIC_LIBRARIES} diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD index 4027ae3ef88..2d5a708bac6 100644 --- a/tensorflow/contrib/distributions/BUILD +++ b/tensorflow/contrib/distributions/BUILD @@ -54,6 +54,29 @@ cuda_py_tests( ], ) +cuda_py_tests( + name = "operator_pd_identity_test", + size = "small", + srcs = ["python/kernel_tests/operator_pd_identity_test.py"], + 
additional_deps = [ + ":distributions_py", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], +) + +cuda_py_tests( + name = "operator_pd_vdvt_update_test", + size = "large", + srcs = ["python/kernel_tests/operator_pd_vdvt_update_test.py"], + additional_deps = [ + ":distributions_py", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], + tags = ["notap"], # http://b/30441813 +) + py_library( name = "distributions_py", srcs = ["__init__.py"] + glob(["python/ops/*.py"]), @@ -76,7 +99,16 @@ cuda_py_tests( srcs = ["python/kernel_tests/beta_test.py"], additional_deps = [ ":distributions_py", - "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], +) + +cuda_py_tests( + name = "binomial_test", + size = "small", + srcs = ["python/kernel_tests/binomial_test.py"], + additional_deps = [ + ":distributions_py", "//tensorflow/python:platform_test", ], tags = ["notsan"], @@ -156,9 +188,8 @@ cuda_py_tests( ) cuda_py_tests( - name = "kullback_leibler_test", - size = "small", - srcs = ["python/kernel_tests/kullback_leibler_test.py"], + name = "laplace_test", + srcs = ["python/kernel_tests/laplace_test.py"], additional_deps = [ ":distributions_py", "//tensorflow/python:framework_test_lib", @@ -167,13 +198,14 @@ cuda_py_tests( ) cuda_py_tests( - name = "laplace_test", - srcs = ["python/kernel_tests/laplace_test.py"], + name = "multinomial_test", + srcs = ["python/kernel_tests/multinomial_test.py"], additional_deps = [ ":distributions_py", "//tensorflow/python:framework_test_lib", "//tensorflow/python:platform_test", ], + tags = ["notsan"], ) cuda_py_tests( @@ -216,6 +248,15 @@ cuda_py_tests( srcs = ["python/kernel_tests/uniform_test.py"], additional_deps = [ ":distributions_py", + "//tensorflow/python:framework_test_lib", + ], +) + +cuda_py_tests( + name = "kullback_leibler_test", + size = "small", + srcs = ["python/kernel_tests/kullback_leibler_test.py"], + additional_deps = [ 
"//tensorflow/python:platform_test", ], ) @@ -240,6 +281,28 @@ cuda_py_tests( ], ) +cuda_py_tests( + name = "shape_test", + size = "small", + srcs = ["python/kernel_tests/shape_test.py"], + additional_deps = [ + ":distributions_py", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], +) + +cuda_py_tests( + name = "bijector_test", + size = "small", + srcs = ["python/kernel_tests/bijector_test.py"], + additional_deps = [ + ":distributions_py", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], +) + filegroup( name = "all_files", srcs = glob( diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py index 2b32556f3eb..83719157761 100644 --- a/tensorflow/contrib/distributions/__init__.py +++ b/tensorflow/contrib/distributions/__init__.py @@ -25,6 +25,7 @@ initialized with parameters that define the distributions. ### Univariate (scalar) distributions +@@Binomial @@Bernoulli @@Beta @@Categorical @@ -50,6 +51,7 @@ initialized with parameters that define the distributions. 
@@Dirichlet @@DirichletMultinomial +@@Multinomial ### Transformed distributions @@ -79,6 +81,7 @@ from __future__ import print_function from tensorflow.contrib.distributions.python.ops.bernoulli import * from tensorflow.contrib.distributions.python.ops.beta import * +from tensorflow.contrib.distributions.python.ops.binomial import * from tensorflow.contrib.distributions.python.ops.categorical import * from tensorflow.contrib.distributions.python.ops.chi2 import * from tensorflow.contrib.distributions.python.ops.dirichlet import * @@ -89,6 +92,7 @@ from tensorflow.contrib.distributions.python.ops.gamma import * from tensorflow.contrib.distributions.python.ops.inverse_gamma import * from tensorflow.contrib.distributions.python.ops.kullback_leibler import * from tensorflow.contrib.distributions.python.ops.laplace import * +from tensorflow.contrib.distributions.python.ops.multinomial import * from tensorflow.contrib.distributions.python.ops.mvn import * from tensorflow.contrib.distributions.python.ops.normal import * from tensorflow.contrib.distributions.python.ops.normal_conjugate_posteriors import * diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py index c636a4d060c..82f77fbfd1e 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py @@ -57,10 +57,17 @@ class BernoulliTest(tf.test.TestCase): self.assertAllClose(scipy.special.logit(p), dist.logits.eval()) def testInvalidP(self): - invalid_ps = [1.01, -0.01, 2., -3.] + invalid_ps = [1.01, 2.] for p in invalid_ps: with self.test_session(): - with self.assertRaisesOpError("x <= y"): + with self.assertRaisesOpError("p has components greater than 1"): + dist = tf.contrib.distributions.Bernoulli(p=p) + dist.p.eval() + + invalid_ps = [-0.01, -3.] 
+ for p in invalid_ps: + with self.test_session(): + with self.assertRaisesOpError("Condition x >= 0"): dist = tf.contrib.distributions.Bernoulli(p=p) dist.p.eval() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijector_test.py new file mode 100644 index 00000000000..fd2cf58fd29 --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijector_test.py @@ -0,0 +1,67 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Bijector.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import tensorflow as tf + +from tensorflow.contrib.distributions.python.ops.bijector import _Exp # pylint: disable=line-too-long +from tensorflow.contrib.distributions.python.ops.bijector import _Identity # pylint: disable=line-too-long +from tensorflow.contrib.distributions.python.ops.shape import _ShapeUtil # pylint: disable=line-too-long + + +class IdentityBijectorTest(tf.test.TestCase): + """Tests the correctness of the Y = g(X) = X transformation.""" + + def testBijector(self): + with self.test_session(): + bijector = _Identity(_ShapeUtil(batch_ndims=1, event_ndims=1)) + self.assertEqual(bijector.name, 'Identity') + x = [[[0.], [1]]] + self.assertAllEqual(bijector.forward(x).eval(), x) + self.assertAllEqual(bijector.inverse(x).eval(), x) + self.assertAllEqual(bijector.inverse_log_det_jacobian(x).eval(), + [[0., 0]]) + rev, jac = bijector.inverse_and_inverse_log_det_jacobian(x) + self.assertAllEqual(rev.eval(), x) + self.assertAllEqual(jac.eval(), [[0., 0]]) + + +class ExpBijectorTest(tf.test.TestCase): + """Tests the correctness of the Y = g(X) = exp(X) transformation.""" + + def testBijector(self): + with self.test_session(): + bijector = _Exp(_ShapeUtil(batch_ndims=1, event_ndims=1)) + self.assertEqual(bijector.name, 'Exp') + x = [[[1.], [2]]] + self.assertAllClose(bijector.forward(x).eval(), + [[[math.exp(1.)], [math.exp(2.)]]]) + self.assertAllClose(bijector.inverse(x).eval(), + [[[math.log(1.)], [math.log(2.)]]]) + self.assertAllClose(bijector.inverse_log_det_jacobian(x).eval(), + [[0., -math.log(2.)]]) + rev, jac = bijector.inverse_and_inverse_log_det_jacobian(x) + self.assertAllClose(rev.eval(), [[[math.log(1.)], [math.log(2.)]]]) + self.assertAllClose(jac.eval(), [[0., -math.log(2.)]]) + + +if __name__ == 
'__main__': + tf.test.main() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py b/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py new file mode 100644 index 00000000000..8b2520f8368 --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py @@ -0,0 +1,173 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from scipy import stats +import tensorflow as tf + + +class BinomialTest(tf.test.TestCase): + + def testSimpleShapes(self): + with self.test_session(): + p = np.float32(np.random.beta(1, 1)) + binom = tf.contrib.distributions.Binomial(n=1., p=p) + self.assertAllEqual([], binom.event_shape().eval()) + self.assertAllEqual([], binom.batch_shape().eval()) + self.assertEqual(tf.TensorShape([]), binom.get_event_shape()) + self.assertEqual(tf.TensorShape([]), binom.get_batch_shape()) + + def testComplexShapes(self): + with self.test_session(): + p = np.random.beta(1, 1, size=(3, 2)).astype(np.float32) + n = [[3., 2], [4, 5], [6, 7]] + binom = tf.contrib.distributions.Binomial(n=n, p=p) + self.assertAllEqual([], binom.event_shape().eval()) + self.assertAllEqual([3, 2], binom.batch_shape().eval()) + 
self.assertEqual(tf.TensorShape([]), binom.get_event_shape()) + self.assertEqual(tf.TensorShape([3, 2]), binom.get_batch_shape()) + + def testNProperty(self): + p = [[0.1, 0.2, 0.7], [0.2, 0.3, 0.5]] + n = [[3.], [4]] + with self.test_session(): + binom = tf.contrib.distributions.Binomial(n=n, p=p) + self.assertEqual((2, 1), binom.n.get_shape()) + self.assertAllClose(n, binom.n.eval()) + + def testPProperty(self): + p = [[0.1, 0.2, 0.7]] + with self.test_session(): + binom = tf.contrib.distributions.Binomial(n=3., p=p) + self.assertEqual((1, 3), binom.p.get_shape()) + self.assertEqual((1, 3), binom.logits.get_shape()) + self.assertAllClose(p, binom.p.eval()) + + def testLogitsProperty(self): + logits = [[0., 9., -0.5]] + with self.test_session(): + binom = tf.contrib.distributions.Binomial(n=3., logits=logits) + self.assertEqual((1, 3), binom.p.get_shape()) + self.assertEqual((1, 3), binom.logits.get_shape()) + self.assertAllClose(logits, binom.logits.eval()) + + def testPmfNandCountsAgree(self): + p = [[0.1, 0.2, 0.7]] + n = [[5.]] + with self.test_session(): + binom = tf.contrib.distributions.Binomial(n=n, p=p) + binom.pmf([2., 3, 2]).eval() + binom.pmf([3., 1, 2]).eval() + with self.assertRaisesOpError('Condition x >= 0.*'): + binom.pmf([-1., 4, 2]).eval() + with self.assertRaisesOpError('Condition x <= y.*'): + binom.pmf([7., 3, 0]).eval() + + def testPmf_non_integer_counts(self): + p = [[0.1, 0.2, 0.7]] + n = [[5.]] + with self.test_session(): + # No errors with integer n. + binom = tf.contrib.distributions.Binomial(n=n, p=p) + binom.pmf([2., 3, 2]).eval() + binom.pmf([3., 1, 2]).eval() + # Both equality and integer checking fail. + with self.assertRaisesOpError('Condition x == y.*'): + binom.pmf([1.0, 2.5, 1.5]).eval() + + binom = tf.contrib.distributions.Binomial(n=n, p=p, validate_args=False) + binom.pmf([1., 2., 3.]).eval() + # Non-integer arguments work. 
+ binom.pmf([1.0, 2.5, 1.5]).eval() + + def testPmfBothZeroBatches(self): + with self.test_session(): + # Both zero-batches. No broadcast + p = 0.5 + counts = 1. + pmf = tf.contrib.distributions.Binomial(n=1., p=p).pmf(counts) + self.assertAllClose(0.5, pmf.eval()) + self.assertEqual((), pmf.get_shape()) + + def testPmfBothZeroBatchesNontrivialN(self): + with self.test_session(): + # Both zero-batches. No broadcast + p = 0.1 + counts = 3. + binom = tf.contrib.distributions.Binomial(n=5., p=p) + pmf = binom.pmf(counts) + self.assertAllClose(stats.binom.pmf(counts, n=5., p=p), pmf.eval()) + self.assertEqual((), pmf.get_shape()) + + def testPmfPStretchedInBroadcastWhenSameRank(self): + with self.test_session(): + p = [[0.1, 0.9]] + counts = [[1., 2.]] + pmf = tf.contrib.distributions.Binomial(n=3., p=p).pmf(counts) + self.assertAllClose(stats.binom.pmf(counts, n=3., p=p), pmf.eval()) + self.assertEqual((1, 2), pmf.get_shape()) + + def testPmfPStretchedInBroadcastWhenLowerRank(self): + with self.test_session(): + p = [0.1, 0.4] + counts = [[1.], [0.]] + pmf = tf.contrib.distributions.Binomial(n=1., p=p).pmf(counts) + self.assertAllClose([[0.1, 0.4], [0.9, 0.6]], pmf.eval()) + self.assertEqual((2, 2), pmf.get_shape()) + + def testBinomialMean(self): + with self.test_session(): + n = 5. + p = [0.1, 0.2, 0.7] + binom = tf.contrib.distributions.Binomial(n=n, p=p) + expected_means = stats.binom.mean(n, p) + self.assertEqual((3,), binom.mean().get_shape()) + self.assertAllClose(expected_means, binom.mean().eval()) + + def testBinomialVariance(self): + with self.test_session(): + n = 5. + p = [0.1, 0.2, 0.7] + binom = tf.contrib.distributions.Binomial(n=n, p=p) + expected_variances = stats.binom.var(n, p) + self.assertEqual((3,), binom.variance().get_shape()) + self.assertAllClose(expected_variances, binom.variance().eval()) + + def testBinomialMode(self): + with self.test_session(): + n = 5. 
+ p = [0.1, 0.2, 0.7] + binom = tf.contrib.distributions.Binomial(n=n, p=p) + expected_modes = [0., 1, 4] + self.assertEqual((3,), binom.mode().get_shape()) + self.assertAllClose(expected_modes, binom.mode().eval()) + + def testBinomialMultipleMode(self): + with self.test_session(): + n = 9. + p = [0.1, 0.2, 0.7] + binom = tf.contrib.distributions.Binomial(n=n, p=p) + # For the case where (n + 1) * p is an integer, the modes are: + # (n + 1) * p and (n + 1) * p - 1. In this case, we get back + # the larger of the two modes. + expected_modes = [1., 2, 7] + self.assertEqual((3,), binom.mode().get_shape()) + self.assertAllClose(expected_modes, binom.mode().eval()) + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py b/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py index 866fb45524c..23833a246b9 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py @@ -61,14 +61,14 @@ class DirichletMultinomialTest(tf.test.TestCase): n = [[5.]] with self.test_session(): dist = tf.contrib.distributions.DirichletMultinomial(n, alpha) - dist.pmf([2, 3, 0]).eval() - dist.pmf([3, 0, 2]).eval() + dist.pmf([2., 3, 0]).eval() + dist.pmf([3., 0, 2]).eval() with self.assertRaisesOpError('Condition x >= 0.*'): - dist.pmf([-1, 4, 2]).eval() - with self.assertRaisesOpError('Condition x == y.*'): - dist.pmf([3, 3, 0]).eval() + dist.pmf([-1., 4, 2]).eval() + with self.assertRaisesOpError('counts do not sum to n'): + dist.pmf([3., 3, 0]).eval() - def testPmfArbitraryCounts(self): + def testPmf_non_integer_counts(self): alpha = [[1., 2, 3]] n = [[5.]] with self.test_session(): @@ -80,8 +80,10 @@ class DirichletMultinomialTest(tf.test.TestCase): with self.assertRaisesOpError('Condition x == y.*'): dist.pmf([1.0, 2.5, 1.5]).eval() dist = 
tf.contrib.distributions.DirichletMultinomial( - n, alpha, allow_arbitrary_counts=True) - dist.pmf(np.array([1.0, 2.5, 1.5])).eval() + n, alpha, validate_args=False) + dist.pmf([1., 2., 3.]).eval() + # Non-integer arguments work. + dist.pmf([1.0, 2.5, 1.5]).eval() def testPmfBothZeroBatches(self): # The probabilities of one vote falling into class k is the mean for class @@ -90,7 +92,7 @@ class DirichletMultinomialTest(tf.test.TestCase): # Both zero-batches. No broadcast alpha = [1., 2] counts = [1., 0] - dist = tf.contrib.distributions.DirichletMultinomial(1, alpha) + dist = tf.contrib.distributions.DirichletMultinomial(1., alpha) pmf = dist.pmf(counts) self.assertAllClose(1 / 3., pmf.eval()) self.assertEqual((), pmf.get_shape()) @@ -102,7 +104,7 @@ class DirichletMultinomialTest(tf.test.TestCase): # Both zero-batches. No broadcast alpha = [1., 2] counts = [3., 2] - dist = tf.contrib.distributions.DirichletMultinomial(5, alpha) + dist = tf.contrib.distributions.DirichletMultinomial(5., alpha) pmf = dist.pmf(counts) self.assertAllClose(1 / 7., pmf.eval()) self.assertEqual((), pmf.get_shape()) @@ -113,7 +115,7 @@ class DirichletMultinomialTest(tf.test.TestCase): with self.test_session(): alpha = [1., 2] counts = [3., 2] - n = np.full([4, 3], 5.) 
+ n = np.full([4, 3], 5., dtype=np.float32) dist = tf.contrib.distributions.DirichletMultinomial(n, alpha) pmf = dist.pmf(counts) self.assertAllClose([[1 / 7., 1 / 7., 1 / 7.]] * 4, pmf.eval()) @@ -125,7 +127,7 @@ class DirichletMultinomialTest(tf.test.TestCase): with self.test_session(): alpha = [[1., 2]] counts = [[1., 0], [0., 1]] - dist = tf.contrib.distributions.DirichletMultinomial([1], alpha) + dist = tf.contrib.distributions.DirichletMultinomial([1.], alpha) pmf = dist.pmf(counts) self.assertAllClose([1 / 3., 2 / 3.], pmf.eval()) self.assertEqual((2), pmf.get_shape()) @@ -231,12 +233,12 @@ class DirichletMultinomialTest(tf.test.TestCase): def testVariance_n_alpha_broadcast(self): alpha_v = [1., 2, 3] - alpha_0 = np.sum(alpha_v) + alpha_0 = 6. # Shape [4, 3] - alpha = np.array(4 * [alpha_v]) + alpha = np.array(4 * [alpha_v], dtype=np.float32) # Shape [4, 1] - ns = np.array([[2.], [3.], [4.], [5.]]) + ns = np.array([[2.], [3.], [4.], [5.]], dtype=np.float32) variance_entry = lambda a, a_sum: a / a_sum * (1 - a / a_sum) covariance_entry = lambda a, b, a_sum: -a * b/ a_sum**2 @@ -250,7 +252,7 @@ class DirichletMultinomialTest(tf.test.TestCase): covariance_entry(alpha_v[1], alpha_v[2], alpha_0)], [covariance_entry(alpha_v[2], alpha_v[0], alpha_0), covariance_entry(alpha_v[2], alpha_v[1], alpha_0), - variance_entry(alpha_v[2], alpha_0)]]]) + variance_entry(alpha_v[2], alpha_0)]]], dtype=np.float32) with self.test_session(): # ns is shape [4, 1], and alpha is shape [4, 3]. @@ -263,11 +265,11 @@ class DirichletMultinomialTest(tf.test.TestCase): self.assertAllClose(expected_variance, variance.eval()) def testVariance_multidimensional(self): - alpha = np.random.rand(3, 5, 4) - alpha2 = np.random.rand(6, 3, 3) - # Ensure n > 0. 
- ns = np.random.geometric(p=0.8, size=[3, 5, 1]) + 1 - ns2 = np.random.geometric(p=0.8, size=[6, 1, 1]) + 1 + alpha = np.random.rand(3, 5, 4).astype(np.float32) + alpha2 = np.random.rand(6, 3, 3).astype(np.float32) + + ns = np.random.randint(low=1, high=11, size=[3, 5, 1]).astype(np.float32) + ns2 = np.random.randint(low=1, high=11, size=[6, 1, 1]).astype(np.float32) with self.test_session(): dist = tf.contrib.distributions.DirichletMultinomial(ns, alpha) @@ -297,7 +299,7 @@ class DirichletMultinomialTest(tf.test.TestCase): # One (three sided) coin flip. Prob[coin 3] = 0.8. # Note that since it was one flip, value of tau didn't matter. - counts = [0, 0, 1] + counts = [0., 0, 1] with self.test_session(): dist = tf.contrib.distributions.DirichletMultinomial(1., alpha) pmf = dist.pmf(counts) @@ -305,9 +307,9 @@ class DirichletMultinomialTest(tf.test.TestCase): self.assertEqual((), pmf.get_shape()) # Two (three sided) coin flips. Prob[coin 3] = 0.8. - counts = [0, 0, 2] + counts = [0., 0, 2] with self.test_session(): - dist = tf.contrib.distributions.DirichletMultinomial(2, alpha) + dist = tf.contrib.distributions.DirichletMultinomial(2., alpha) pmf = dist.pmf(counts) self.assertAllClose(0.8**2, pmf.eval(), atol=1e-2) self.assertEqual((), pmf.get_shape()) @@ -315,7 +317,7 @@ class DirichletMultinomialTest(tf.test.TestCase): # Three (three sided) coin flips. counts = [1., 0, 2] with self.test_session(): - dist = tf.contrib.distributions.DirichletMultinomial(3, alpha) + dist = tf.contrib.distributions.DirichletMultinomial(3., alpha) pmf = dist.pmf(counts) self.assertAllClose(3 * 0.1 * 0.8 * 0.8, pmf.eval(), atol=1e-2) self.assertEqual((), pmf.get_shape()) @@ -336,10 +338,10 @@ class DirichletMultinomialTest(tf.test.TestCase): self.assertEqual((), pmf.get_shape()) # If there are two draws, it is much more likely that they are the same. - counts_same = [2, 0] + counts_same = [2., 0] counts_different = [1, 1.] 
with self.test_session(): - dist = tf.contrib.distributions.DirichletMultinomial(2, alpha) + dist = tf.contrib.distributions.DirichletMultinomial(2., alpha) pmf_same = dist.pmf(counts_same) pmf_different = dist.pmf(counts_different) self.assertLess(5 * pmf_different.eval(), pmf_same.eval()) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py b/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py new file mode 100644 index 00000000000..55c7825bf3e --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py @@ -0,0 +1,226 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + + +class MultinomialTest(tf.test.TestCase): + + def testSimpleShapes(self): + with self.test_session(): + p = [.1, .3, .6] + dist = tf.contrib.distributions.Multinomial(n=1., p=p) + self.assertEqual(3, dist.event_shape().eval()) + self.assertAllEqual([], dist.batch_shape().eval()) + self.assertEqual(tf.TensorShape([3]), dist.get_event_shape()) + self.assertEqual(tf.TensorShape([]), dist.get_batch_shape()) + + def testComplexShapes(self): + with self.test_session(): + p = 0.5 * np.ones([3, 2, 2], dtype=np.float32) + n = [[3., 2], [4, 5], [6, 7]] + dist = tf.contrib.distributions.Multinomial(n=n, p=p) + self.assertEqual(2, dist.event_shape().eval()) + self.assertAllEqual([3, 2], dist.batch_shape().eval()) + self.assertEqual(tf.TensorShape([2]), dist.get_event_shape()) + self.assertEqual(tf.TensorShape([3, 2]), dist.get_batch_shape()) + + def testNProperty(self): + p = [[0.1, 0.2, 0.7], [0.2, 0.3, 0.5]] + n = [[3.], [4]] + with self.test_session(): + dist = tf.contrib.distributions.Multinomial(n=n, p=p) + self.assertEqual((2, 1), dist.n.get_shape()) + self.assertAllClose(n, dist.n.eval()) + + def testPProperty(self): + p = [[0.1, 0.2, 0.7]] + with self.test_session(): + dist = tf.contrib.distributions.Multinomial(n=3., p=p) + self.assertEqual((1, 3), dist.p.get_shape()) + self.assertEqual((1, 3), dist.logits.get_shape()) + self.assertAllClose(p, dist.p.eval()) + + def testLogitsProperty(self): + logits = [[0., 9., -0.5]] + with self.test_session(): + multinom = tf.contrib.distributions.Multinomial(n=3., logits=logits) + self.assertEqual((1, 3), multinom.p.get_shape()) + self.assertEqual((1, 3), multinom.logits.get_shape()) + self.assertAllClose(logits, multinom.logits.eval()) + + def testPmfNandCountsAgree(self): + p = [[0.1, 
0.2, 0.7]] + n = [[5.]] + with self.test_session(): + dist = tf.contrib.distributions.Multinomial(n=n, p=p) + dist.pmf([2., 3, 0]).eval() + dist.pmf([3., 0, 2]).eval() + with self.assertRaisesOpError('Condition x >= 0.*'): + dist.pmf([-1., 4, 2]).eval() + with self.assertRaisesOpError('counts do not sum to n'): + dist.pmf([3., 3, 0]).eval() + + def testPmf_non_integer_counts(self): + p = [[0.1, 0.2, 0.7]] + n = [[5.]] + with self.test_session(): + # No errors with integer n. + multinom = tf.contrib.distributions.Multinomial(n=n, p=p) + multinom.pmf([2., 1, 2]).eval() + multinom.pmf([3., 0, 2]).eval() + # Counts don't sum to n. + with self.assertRaisesOpError('counts do not sum to n'): + multinom.pmf([2., 3, 2]).eval() + # Counts are non-integers. + with self.assertRaisesOpError('Condition x == y.*'): + multinom.pmf([1.0, 2.5, 1.5]).eval() + + multinom = tf.contrib.distributions.Multinomial( + n=n, p=p, validate_args=False) + multinom.pmf([1., 2., 2.]).eval() + # Non-integer arguments work. + multinom.pmf([1.0, 2.5, 1.5]).eval() + + def testPmfBothZeroBatches(self): + with self.test_session(): + # Both zero-batches. No broadcast + p = [0.5, 0.5] + counts = [1., 0] + pmf = tf.contrib.distributions.Multinomial(n=1., p=p).pmf(counts) + self.assertAllClose(0.5, pmf.eval()) + self.assertEqual((), pmf.get_shape()) + + def testPmfBothZeroBatchesNontrivialN(self): + with self.test_session(): + # Both zero-batches. No broadcast + p = [0.1, 0.9] + counts = [3., 2] + dist = tf.contrib.distributions.Multinomial(n=5., p=p) + pmf = dist.pmf(counts) + # 5 choose 3 = 5 choose 2 = 10. 10 * (.9)^2 * (.1)^3 = 81/10000. 
+ self.assertAllClose(81./10000, pmf.eval()) + self.assertEqual((), pmf.get_shape()) + + def testPmfPStretchedInBroadcastWhenSameRank(self): + with self.test_session(): + p = [[0.1, 0.9]] + counts = [[1., 0], [0, 1]] + pmf = tf.contrib.distributions.Multinomial(n=1., p=p).pmf(counts) + self.assertAllClose([0.1, 0.9], pmf.eval()) + self.assertEqual((2), pmf.get_shape()) + + def testPmfPStretchedInBroadcastWhenLowerRank(self): + with self.test_session(): + p = [0.1, 0.9] + counts = [[1., 0], [0, 1]] + pmf = tf.contrib.distributions.Multinomial(n=1., p=p).pmf(counts) + self.assertAllClose([0.1, 0.9], pmf.eval()) + self.assertEqual((2), pmf.get_shape()) + + def testPmfCountsStretchedInBroadcastWhenSameRank(self): + with self.test_session(): + p = [[0.1, 0.9], [0.7, 0.3]] + counts = [[1., 0]] + pmf = tf.contrib.distributions.Multinomial(n=1., p=p).pmf(counts) + self.assertAllClose(pmf.eval(), [0.1, 0.7]) + self.assertEqual((2), pmf.get_shape()) + + def testPmfCountsStretchedInBroadcastWhenLowerRank(self): + with self.test_session(): + p = [[0.1, 0.9], [0.7, 0.3]] + counts = [1., 0] + pmf = tf.contrib.distributions.Multinomial(n=1., p=p).pmf(counts) + self.assertAllClose(pmf.eval(), [0.1, 0.7]) + self.assertEqual(pmf.get_shape(), (2)) + + def testPmfShapeCountsStretched_N(self): + with self.test_session(): + # [2, 2, 2] + p = [[[0.1, 0.9], [0.1, 0.9]], [[0.7, 0.3], [0.7, 0.3]]] + # [2, 2] + n = [[3., 3], [3, 3]] + # [2] + counts = [2., 1] + pmf = tf.contrib.distributions.Multinomial(n=n, p=p).pmf(counts) + pmf.eval() + self.assertEqual(pmf.get_shape(), (2, 2)) + + def testPmfShapeCountsPStretched_N(self): + with self.test_session(): + p = [0.1, 0.9] + counts = [3., 2] + n = np.full([4, 3], 5., dtype=np.float32) + pmf = tf.contrib.distributions.Multinomial(n=n, p=p).pmf(counts) + pmf.eval() + self.assertEqual((4, 3), pmf.get_shape()) + + def testMultinomialMean(self): + with self.test_session(): + n = 5. 
+ p = [0.1, 0.2, 0.7] + dist = tf.contrib.distributions.Multinomial(n=n, p=p) + expected_means = 5 * np.array(p, dtype=np.float32) + self.assertEqual((3,), dist.mean().get_shape()) + self.assertAllClose(expected_means, dist.mean().eval()) + + def testMultinomialVariance(self): + with self.test_session(): + n = 5. + p = [0.1, 0.2, 0.7] + dist = tf.contrib.distributions.Multinomial(n=n, p=p) + expected_variances = [ + [9./20, -1/10, -7/20], [-1/10, 4/5, -7/10], [-7/20, -7/10, 21/20]] + self.assertEqual((3, 3), dist.variance().get_shape()) + self.assertAllClose(expected_variances, dist.variance().eval()) + + def testMultinomialVariance_batch(self): + with self.test_session(): + # Shape [2] + n = [5.] * 2 + # Shape [4, 1, 2] + p = [[[0.1, 0.9]], [[0.1, 0.9]]] * 2 + dist = tf.contrib.distributions.Multinomial(n=n, p=p) + # Shape [2, 2] + inner_var = [[9./20, -9/20], [-9/20, 9/20]] + # Shape [4, 2, 2, 2] + expected_variances = [[inner_var, inner_var]] * 4 + self.assertEqual((4, 2, 2, 2), dist.variance().get_shape()) + self.assertAllClose(expected_variances, dist.variance().eval()) + + def testVariance_multidimensional(self): + # Shape [3, 5, 4] + p = np.random.dirichlet([.25, .25, .25, .25], [3, 5]).astype(np.float32) + # Shape [6, 3, 3] + p2 = np.random.dirichlet([.3, .3, .4], [6, 3]).astype(np.float32) + + ns = np.random.randint(low=1, high=11, size=[3, 5]).astype(np.float32) + ns2 = np.random.randint(low=1, high=11, size=[6, 1]).astype(np.float32) + + with self.test_session(): + dist = tf.contrib.distributions.Multinomial(ns, p) + dist2 = tf.contrib.distributions.Multinomial(ns2, p2) + + variance = dist.variance() + variance2 = dist2.variance() + self.assertEqual((3, 5, 4, 4), variance.get_shape()) + self.assertEqual((6, 3, 3, 3), variance2.get_shape()) + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_test.py index 
d8e75e4be22..748439070c5 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_test.py @@ -117,6 +117,61 @@ class MultivariateNormalDiagTest(tf.test.TestCase): self.assertAllClose(cov_mat, np.cov(samps.T), atol=0.1) +class MultivariateNormalDiagPlusVDVTTest(tf.test.TestCase): + """Well tested because this is a simple override of the base class.""" + + def setUp(self): + self._rng = np.random.RandomState(42) + + def testMean(self): + mu = [-1.0, 1.0] + diag_large = [1.0, 5.0] + v = [[2.0], [3.0]] + diag_small = [3.0] + with self.test_session(): + dist = distributions.MultivariateNormalDiagPlusVDVT( + mu, diag_large, v, diag_small=diag_small) + self.assertAllEqual(mu, dist.mean().eval()) + + def testNonmatchingMuAndSigmaDimensionFailsStatic(self): + mu = self._rng.rand(2) + # With this diag_large and v, the covariance is 3 x 3 + diag_large = self._rng.rand(3) + v = self._rng.rand(3, 2) # v works with diag_large. + with self.test_session(): + with self.assertRaisesRegexp(ValueError, "shape.*should match"): + distributions.MultivariateNormalDiagPlusVDVT( + mu, diag_large, v) + + def testNonmatchingMuDiagDimensionsFailsDynamic(self): + mu = self._rng.rand(2) + # With this diag_large and v, the covariance is 3 x 3 + diag_large = self._rng.rand(3) + v = self._rng.rand(3, 2) # v works with diag_large. 
+ + with self.test_session(): + mu_ph = tf.placeholder(tf.float32, name="mu_ph") + v_ph = tf.placeholder(tf.float32, name="v_ph") + diag_ph = tf.placeholder(tf.float32, name="diag_ph") + dist = distributions.MultivariateNormalDiagPlusVDVT( + mu_ph, diag_ph, v_ph) + with self.assertRaisesOpError("mu.*cov.*shape"): + dist.mean().eval(feed_dict={mu_ph: mu, diag_ph: diag_large, v_ph: v}) + + def testSample(self): + mu = [-1.0, 1.0] + diag_large = [1.0, 0.5] + v = [[0.2], [0.3]] + with self.test_session(): + dist = distributions.MultivariateNormalDiagPlusVDVT(mu, diag_large, v) + + samps = dist.sample_n(1000, seed=0).eval() + cov_mat = dist.sigma.eval() + + self.assertAllClose(mu, samps.mean(axis=0), atol=0.1) + self.assertAllClose(cov_mat, np.cov(samps.T), atol=0.1) + + class MultivariateNormalCholeskyTest(tf.test.TestCase): def setUp(self): @@ -314,5 +369,87 @@ class MultivariateNormalCholeskyTest(tf.test.TestCase): self.assertEqual((3, 5), tuple(mvn.batch_shape().eval())) +class MultivariateNormalFullTest(tf.test.TestCase): + + def setUp(self): + self._rng = np.random.RandomState(42) + + def _random_mu_and_sigma(self, batch_shape, event_shape): + # This ensures sigma is positive def. 
+ mat_shape = batch_shape + event_shape + event_shape + mat = self._rng.randn(*mat_shape) + sigma = tf.batch_matmul(mat, mat, adj_y=True).eval() + + mu_shape = batch_shape + event_shape + mu = self._rng.randn(*mu_shape) + + return mu, sigma + + def testKLNonBatch(self): + batch_shape = () + event_shape = (2,) + with self.test_session(): + mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape) + mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape) + mvn_a = distributions.MultivariateNormalFull(mu_a, sigma_a) + mvn_b = distributions.MultivariateNormalFull(mu_b, sigma_b) + + kl = distributions.kl(mvn_a, mvn_b) + self.assertEqual(batch_shape, kl.get_shape()) + + kl_v = kl.eval() + expected_kl = _compute_non_batch_kl(mu_a, sigma_a, mu_b, sigma_b) + self.assertAllClose(expected_kl, kl_v) + + def testKLBatch(self): + batch_shape = (2,) + event_shape = (3,) + with self.test_session(): + mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape) + mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape) + mvn_a = distributions.MultivariateNormalFull(mu_a, sigma_a) + mvn_b = distributions.MultivariateNormalFull(mu_b, sigma_b) + + kl = distributions.kl(mvn_a, mvn_b) + self.assertEqual(batch_shape, kl.get_shape()) + + kl_v = kl.eval() + expected_kl_0 = _compute_non_batch_kl( + mu_a[0, :], sigma_a[0, :, :], mu_b[0, :], sigma_b[0, :]) + expected_kl_1 = _compute_non_batch_kl( + mu_a[1, :], sigma_a[1, :, :], mu_b[1, :], sigma_b[1, :]) + self.assertAllClose(expected_kl_0, kl_v[0]) + self.assertAllClose(expected_kl_1, kl_v[1]) + + def testKLTwoIdenticalDistributionsIsZero(self): + batch_shape = (2,) + event_shape = (3,) + with self.test_session(): + mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape) + mvn_a = distributions.MultivariateNormalFull(mu_a, sigma_a) + + # Should be zero since KL(p || p) = 0. 
+ kl = distributions.kl(mvn_a, mvn_a) + self.assertEqual(batch_shape, kl.get_shape()) + + kl_v = kl.eval() + self.assertAllClose(np.zeros(*batch_shape), kl_v) + + +def _compute_non_batch_kl(mu_a, sigma_a, mu_b, sigma_b): + """Non-batch KL for N(mu_a, sigma_a), N(mu_b, sigma_b).""" + # Check using numpy operations + # This mostly repeats the tensorflow code _kl_mvn_mvn(), but in numpy. + # So it is important to also check that KL(mvn, mvn) = 0. + sigma_b_inv = np.linalg.inv(sigma_b) + + t = np.trace(sigma_b_inv.dot(sigma_a)) + q = (mu_b - mu_a).dot(sigma_b_inv).dot(mu_b - mu_a) + k = mu_a.shape[0] + l = np.log(np.linalg.det(sigma_b) / np.linalg.det(sigma_a)) + + return 0.5 * (t + q - k + l) + + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_diag_test.py index 3a0f6e1d5a7..c11b0357e73 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_diag_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_diag_test.py @@ -17,14 +17,17 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import abc import numpy as np +import six import tensorflow as tf from tensorflow.contrib.distributions.python.ops import operator_pd_diag from tensorflow.contrib.distributions.python.ops import operator_test_util -class OperatorPDSqrtDiagTest(operator_test_util.OperatorPDDerivedClassTest): +@six.add_metaclass(abc.ABCMeta) +class OperatorPDDiagBaseTest(object): def setUp(self): self._rng = np.random.RandomState(42) @@ -32,8 +35,14 @@ class OperatorPDSqrtDiagTest(operator_test_util.OperatorPDDerivedClassTest): def _random_pd_diag(self, diag_shape): return self._rng.rand(*diag_shape) + 0.1 + @abc.abstractmethod def _diag_to_matrix(self, diag): - return tf.batch_matrix_diag(diag**2).eval() + pass + + @abc.abstractproperty + def 
operator_class(self): + # Return the operator class that this tests. + pass def _build_operator_and_mat(self, batch_shape, k, dtype=np.float64): # Create a diagonal matrix explicitly. @@ -46,7 +55,7 @@ class OperatorPDSqrtDiagTest(operator_test_util.OperatorPDDerivedClassTest): # The diag is the square root. diag = self._random_pd_diag(diag_shape).astype(dtype) mat = self._diag_to_matrix(diag).astype(dtype) - operator = operator_pd_diag.OperatorPDSqrtDiag(diag) + operator = self.operator_class(diag) return operator, mat @@ -66,5 +75,29 @@ class OperatorPDSqrtDiagTest(operator_test_util.OperatorPDDerivedClassTest): operator.to_dense().eval() # Should not raise +class OperatorPDDiagTest( + OperatorPDDiagBaseTest, operator_test_util.OperatorPDDerivedClassTest): + """Most tests done in the base classes.""" + + def _diag_to_matrix(self, diag): + return tf.batch_matrix_diag(diag).eval() + + @property + def operator_class(self): + return operator_pd_diag.OperatorPDDiag + + +class OperatorPDSqrtDiagTest( + OperatorPDDiagBaseTest, operator_test_util.OperatorPDDerivedClassTest): + """Most tests done in the base classes.""" + + def _diag_to_matrix(self, diag): + return tf.batch_matrix_diag(diag**2).eval() + + @property + def operator_class(self): + return operator_pd_diag.OperatorPDSqrtDiag + + if __name__ == '__main__': tf.test.main() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_identity_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_identity_test.py new file mode 100644 index 00000000000..7f411105fb0 --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_identity_test.py @@ -0,0 +1,115 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + +from tensorflow.contrib.distributions.python.ops import operator_pd_identity +from tensorflow.contrib.distributions.python.ops import operator_test_util + +distributions = tf.contrib.distributions + + +class OperatorPDIdentityTest(operator_test_util.OperatorPDDerivedClassTest): + """Most tests done in the base class.""" + + def _build_operator_and_mat(self, batch_shape, k, dtype=np.float64): + # Build an identity matrix with right shape and dtype. + # Build an operator that should act the same way. + batch_shape = list(batch_shape) + diag_shape = batch_shape + [k] + matrix_shape = batch_shape + [k, k] + diag = tf.ones(diag_shape, dtype=dtype) + identity_matrix = tf.batch_matrix_diag(diag) + operator = operator_pd_identity.OperatorPDIdentity(matrix_shape, dtype) + return operator, identity_matrix.eval() + + def test_bad_dtype_args_raise(self): + dtype = np.float32 + batch_shape = [2, 3] + k = 4 + with self.test_session(): + operator, _ = self._build_operator_and_mat(batch_shape, k, dtype=dtype) + + x_good_shape = batch_shape + [k, 5] + x_good = self._rng.randn(*x_good_shape).astype(dtype) + x_bad = x_good.astype(np.float64) + + operator.matmul(x_good).eval() # Should not raise. 
+ + with self.assertRaisesRegexp(TypeError, 'dtype'): + operator.matmul(x_bad) + + with self.assertRaisesRegexp(TypeError, 'dtype'): + operator.solve(x_bad) + + with self.assertRaisesRegexp(TypeError, 'dtype'): + operator.sqrt_solve(x_bad) + + def test_bad_rank_args_raise(self): + # Prepend a singleton dimension, changing the rank of 'x', but not the size. + dtype = np.float32 + batch_shape = [2, 3] + k = 4 + with self.test_session(): + operator, _ = self._build_operator_and_mat(batch_shape, k, dtype=dtype) + + x_good_shape = batch_shape + [k, 5] + x_good = self._rng.randn(*x_good_shape).astype(dtype) + x_bad = x_good.reshape(1, 2, 3, 4, 5) + + operator.matmul(x_good).eval() # Should not raise. + + with self.assertRaisesRegexp(ValueError, 'tensor rank'): + operator.matmul(x_bad) + + with self.assertRaisesRegexp(ValueError, 'tensor rank'): + operator.solve(x_bad) + + with self.assertRaisesRegexp(ValueError, 'tensor rank'): + operator.sqrt_solve(x_bad) + + def test_incompatible_shape_args_raise(self): + # Test shapes that are the same rank but incompatible for matrix + # multiplication. + dtype = np.float32 + batch_shape = [2, 3] + k = 4 + with self.test_session(): + operator, _ = self._build_operator_and_mat(batch_shape, k, dtype=dtype) + + x_good_shape = batch_shape + [k, 5] + x_good = self._rng.randn(*x_good_shape).astype(dtype) + x_bad_shape = batch_shape + [5, k] + x_bad = x_good.reshape(*x_bad_shape) + + operator.matmul(x_good).eval() # Should not raise. 
+ + with self.assertRaisesRegexp(ValueError, 'Incompatible'): + operator.matmul(x_bad) + + with self.assertRaisesRegexp(ValueError, 'Incompatible'): + operator.solve(x_bad) + + with self.assertRaisesRegexp(ValueError, 'Incompatible'): + operator.sqrt_solve(x_bad) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_vdvt_update_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_vdvt_update_test.py new file mode 100644 index 00000000000..66d54561b52 --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_vdvt_update_test.py @@ -0,0 +1,273 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + +from tensorflow.contrib.distributions.python.ops import operator_pd_full +from tensorflow.contrib.distributions.python.ops import operator_pd_vdvt_update +from tensorflow.contrib.distributions.python.ops import operator_test_util + +distributions = tf.contrib.distributions + + +class OperatorPDSqrtVDVTUpdateTest( + operator_test_util.OperatorPDDerivedClassTest): + """Most tests done in the base class.""" + _diag_is_none = False + + def setUp(self): + self._rng = np.random.RandomState(42) + + def _random_pd_matrix(self, shape): + # With probability 1 this is positive definite. + sqrt = self._rng.randn(*shape) + mat = tf.batch_matmul(sqrt, sqrt, adj_y=True) + return mat.eval() + + def _random_v_and_diag(self, mat_shape, v_matrix_rank): + # Get the necessary elements to make the sqrt update. + mat_shape = list(mat_shape) + batch_shape = mat_shape[:-2] + diag_shape = mat_shape[:-2] + [v_matrix_rank] + k = mat_shape[-1] + assert k == mat_shape[-2], 'Must be a square matrix' + v_shape = batch_shape + [k, v_matrix_rank] + v = self._rng.randn(*v_shape) # anything goes with "v"! + + if self._diag_is_none: + diag = None + else: + diag = self._rng.rand(*diag_shape) + 0.1 # Positive diag! + return v, diag + + def _updated_mat(self, mat, v, diag): + # Get dense matrix defined by its square root, which is an update of `mat`: + # A = (mat + v D v^T) (mat + v D v^T)^T + # D is the diagonal matrix with `diag` on the diagonal. 
+ + # If diag is None, then it defaults to the identity matrix, so DV^T = V^T + if diag is None: + diag_vt = tf.batch_matrix_transpose(v) + else: + diag_mat = tf.batch_matrix_diag(diag) + diag_vt = tf.batch_matmul(diag_mat, v, adj_y=True) + + v_diag_vt = tf.batch_matmul(v, diag_vt) + sqrt = mat + v_diag_vt + a = tf.batch_matmul(sqrt, sqrt, adj_y=True) + return a.eval() + + def _build_operator_and_mat(self, batch_shape, k, dtype=np.float64): + """This method is called by base class, enabling many standard tests.""" + # Create a matrix then explicitly update it with v and diag. + # Create an OperatorPDSqrtVDVTUpdate from the matrix and v and diag + # The operator should have the same behavior. + # + # The low-rank matrix V will have rank 1/2 of k, unless k is 1, in which + # case it will be 1 as well. + if k == 1: + v_matrix_rank = k + else: + v_matrix_rank = k // 2 + mat_shape = list(batch_shape) + [k, k] + mat = self._random_pd_matrix(mat_shape) + v, diag = self._random_v_and_diag(mat_shape, v_matrix_rank) + + # Set dtypes + mat = mat.astype(dtype) + v = v.astype(dtype) + if diag is not None: + diag = diag.astype(dtype) + + # The matrix: (mat + v*diag*v^T) * (mat + v*diag*v^T)^T + # Our final updated operator should behave like this. + updated_mat = self._updated_mat(mat, v, diag) + + # Represents the matrix: `mat`, before updating. + # This is the Operator that we will update. + o_made_with_mat = operator_pd_full.OperatorPDFull(mat) + + # Represents the matrix: (mat + v*diag*v^T) * (mat + v*diag*v^T)^T, + # achieved by updating the operator "o_made_with_mat". + # This is the operator we're testing. + operator = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate( + o_made_with_mat, v, diag) + + return operator, updated_mat + + def test_to_dense_placeholder(self): + # Test simple functionality when the inputs are placeholders. + mat_shape = [3, 3] + v_matrix_rank = 2 + with self.test_session(): + # Make an OperatorPDFull with a matrix placeholder. 
+ mat_ph = tf.placeholder(tf.float64, name='mat_ph') + mat = self._random_pd_matrix(mat_shape) + o_made_with_mat = operator_pd_full.OperatorPDFull(mat_ph) + + # Make the placeholders and arrays for the updated operator. + v_ph = tf.placeholder(tf.float64, name='v_ph') + v, diag = self._random_v_and_diag(mat_shape, v_matrix_rank) + if self._diag_is_none: + diag_ph = None + feed_dict = {v_ph: v, mat_ph: mat} + else: + diag_ph = tf.placeholder(tf.float64, name='diag_ph') + feed_dict = {v_ph: v, diag_ph: diag, mat_ph: mat} + + # Make the OperatorPDSqrtVDVTUpdate with v and diag placeholders. + operator = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate( + o_made_with_mat, v_ph, diag=diag_ph) + + # Should not fail + operator.to_dense().eval(feed_dict=feed_dict) + operator.log_det().eval(feed_dict=feed_dict) + + def test_operator_not_subclass_of_operator_pd_raises(self): + # We enforce that `operator` is an `OperatorPDBase`. + with self.test_session(): + v, diag = self._random_v_and_diag((3, 3), 2) + operator_m = 'I am not a subclass of OperatorPDBase' + + with self.assertRaisesRegexp(TypeError, 'not instance'): + operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(operator_m, v, diag) + + def test_non_pos_def_diag_raises(self): + if self._diag_is_none: + return + # We enforce that the diag is positive definite. + with self.test_session(): + matrix_shape = (3, 3) + v_rank = 2 + v, diag = self._random_v_and_diag(matrix_shape, v_rank) + mat = self._random_pd_matrix(matrix_shape) + diag[0] = 0.0 + + operator_m = operator_pd_full.OperatorPDFull(mat) + operator = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate( + operator_m, v, diag) + + with self.assertRaisesOpError('positive'): + operator.to_dense().eval() + + def test_non_pos_def_diag_doesnt_raise_if_verify_pd_false(self): + # We enforce that the diag is positive definite. 
+ if self._diag_is_none: + return + with self.test_session(): + matrix_shape = (3, 3) + v_rank = 2 + v, diag = self._random_v_and_diag(matrix_shape, v_rank) + mat = self._random_pd_matrix(matrix_shape) + diag[0] = 0.0 + + operator_m = operator_pd_full.OperatorPDFull(mat) + operator = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate( + operator_m, v, diag, verify_pd=False) + + operator.to_dense().eval() # Should not raise. + + def test_event_shape_mismatch_v_and_diag_raises_static(self): + v = self._rng.rand(4, 3, 2) + diag = self._rng.rand(4, 1) # Should be shape (4, 2,) to match v. + with self.test_session(): + + mat = self._random_pd_matrix((4, 3, 3)) # mat and v match + operator_m = operator_pd_full.OperatorPDFull(mat) + with self.assertRaisesRegexp(ValueError, 'diag.*v.*last dimension'): + operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(operator_m, v, diag) + + def test_batch_shape_mismatch_v_and_diag_raises_static(self): + v = self._rng.rand(4, 3, 2) + diag = self._rng.rand(5, 1) # Should be shape (4, 2,) to match v. + with self.test_session(): + + mat = self._random_pd_matrix((4, 3, 3)) # mat and v match + operator_m = operator_pd_full.OperatorPDFull(mat) + with self.assertRaisesRegexp(ValueError, 'diag.*batch shape'): + operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(operator_m, v, diag) + + def test_tensor_rank_shape_mismatch_v_and_diag_raises_static(self): + v = self._rng.rand(1, 2, 2, 2) + diag = self._rng.rand(5, 1) # Should have rank 1 less than v. + with self.test_session(): + + mat = self._random_pd_matrix((1, 2, 2, 2)) # mat and v match + operator_m = operator_pd_full.OperatorPDFull(mat) + with self.assertRaisesRegexp(ValueError, 'diag.*rank'): + operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate(operator_m, v, diag) + + def test_event_shape_mismatch_v_and_diag_raises_dynamic(self): + with self.test_session(): + + v = self._rng.rand(4, 3, 2) + diag = self._rng.rand(4, 1) # Should be shape (4, 2,) to match v. 
+ mat = self._random_pd_matrix((4, 3, 3)) # mat and v match + + v_ph = tf.placeholder(tf.float32, name='v_ph') + diag_ph = tf.placeholder(tf.float32, name='diag_ph') + mat_ph = tf.placeholder(tf.float32, name='mat_ph') + + operator_m = operator_pd_full.OperatorPDFull(mat_ph) + updated = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate( + operator_m, v_ph, diag_ph) + with self.assertRaisesOpError('x == y'): + updated.to_dense().eval(feed_dict={v_ph: v, diag_ph: diag, mat_ph: mat}) + + def test_batch_shape_mismatch_v_and_diag_raises_dynamic(self): + with self.test_session(): + v = self._rng.rand(4, 3, 2) + diag = self._rng.rand(5, 1) # Should be shape (4, 2,) to match v. + mat = self._random_pd_matrix((4, 3, 3)) # mat and v match + + v_ph = tf.placeholder(tf.float32, name='v_ph') + diag_ph = tf.placeholder(tf.float32, name='diag_ph') + mat_ph = tf.placeholder(tf.float32, name='mat_ph') + + operator_m = operator_pd_full.OperatorPDFull(mat_ph) + updated = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate( + operator_m, v_ph, diag_ph) + with self.assertRaisesOpError('x == y'): + updated.to_dense().eval(feed_dict={v_ph: v, diag_ph: diag, mat_ph: mat}) + + def test_tensor_rank_shape_mismatch_v_and_diag_raises_dynamic(self): + with self.test_session(): + + v = self._rng.rand(2, 2, 2, 2) + diag = self._rng.rand(2, 2) # Should have rank 1 less than v. 
+ mat = self._random_pd_matrix((2, 2, 2, 2)) # mat and v match + + v_ph = tf.placeholder(tf.float32, name='v_ph') + diag_ph = tf.placeholder(tf.float32, name='diag_ph') + mat_ph = tf.placeholder(tf.float32, name='mat_ph') + + operator_m = operator_pd_full.OperatorPDFull(mat_ph) + updated = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate( + operator_m, v_ph, diag_ph) + with self.assertRaisesOpError('rank'): + updated.to_dense().eval(feed_dict={v_ph: v, diag_ph: diag, mat_ph: mat}) + + +class OperatorPDSqrtVDVTUpdateNoneDiagTest(OperatorPDSqrtVDVTUpdateTest): + _diag_is_none = True + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py new file mode 100644 index 00000000000..351c69c747f --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/shape_test.py @@ -0,0 +1,165 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for ShapeUtil.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf +from tensorflow.contrib.distributions.python.ops.shape import _ShapeUtil # pylint: disable=line-too-long + + +class ShapeUtilTest(tf.test.TestCase): + + def testShapeUtilGetNdims(self): + with self.test_session(): + shaper = _ShapeUtil(batch_ndims=0, event_ndims=0) + x = 1 + self.assertEqual(shaper.get_sample_ndims(x), 0) + self.assertEqual(shaper.batch_ndims, 0) + self.assertEqual(shaper.event_ndims, 0) + + shaper = _ShapeUtil(batch_ndims=1, event_ndims=1) + x = [[[0., 1, 2], [3, 4, 5]]] + self.assertAllEqual(shaper.get_ndims(x), 3) + self.assertEqual(shaper.get_sample_ndims(x), 1) + self.assertEqual(shaper.batch_ndims, 1) + self.assertEqual(shaper.event_ndims, 1) + + x += [[[6, 7, 8], [9, 10, 11]]] + self.assertAllEqual(shaper.get_ndims(x), 3) + self.assertEqual(shaper.get_sample_ndims(x), 1) + self.assertEqual(shaper.batch_ndims, 1) + self.assertEqual(shaper.event_ndims, 1) + + # Test ndims functions work, even despite unfed Tensors. 
+ y = tf.placeholder(tf.float32, shape=(1024, None, 1024)) + self.assertAllEqual(shaper.get_ndims(y), 3) + self.assertEqual(shaper.get_sample_ndims(y), 1) + self.assertEqual(shaper.batch_ndims, 1) + self.assertEqual(shaper.event_ndims, 1) + + with self.assertRaises(ValueError): + y = tf.placeholder(tf.float32) + shaper.get_ndims(y) + + def testShapeUtilGetDims(self): + with self.test_session(): + shaper = _ShapeUtil(batch_ndims=0, event_ndims=0) + with self.assertRaises(ValueError): + y = tf.placeholder(tf.float32) + shaper.get_sample_dims(y) + with self.assertRaises(ValueError): + y = tf.placeholder(tf.float32) + shaper.get_batch_dims(y) + with self.assertRaises(ValueError): + y = tf.placeholder(tf.float32) + shaper.get_event_dims(y) + + shaper = _ShapeUtil(batch_ndims=0, event_ndims=0) + x = 1 + self.assertAllEqual(shaper.get_sample_dims(x), []) + self.assertAllEqual(shaper.get_batch_dims(x), []) + self.assertAllEqual(shaper.get_event_dims(x), []) + self.assertAllEqual(shaper.get_dims(x, sample=False), []) + + shaper = _ShapeUtil(batch_ndims=1, event_ndims=2) + x = [[[[0., 1], [2, 4]]]] + self.assertAllEqual(shaper.get_sample_dims(x), [0]) + self.assertAllEqual(shaper.get_batch_dims(x), [1]) + self.assertAllEqual(shaper.get_event_dims(x), [2, 3]) + self.assertAllEqual(shaper.get_dims(x, sample=False), [1, 2, 3]) + + x += x + self.assertAllEqual(shaper.get_sample_dims(x), [0]) + self.assertAllEqual(shaper.get_batch_dims(x), [1]) + self.assertAllEqual(shaper.get_event_dims(x), [2, 3]) + self.assertAllEqual(shaper.get_dims(x, sample=False), [1, 2, 3]) + + # Test dims functions work, despite unfed Tensors. 
+ y = tf.placeholder(tf.float32, shape=(1024, None, 5, 5)) + self.assertAllEqual(shaper.get_sample_dims(y), [0]) + self.assertAllEqual(shaper.get_batch_dims(y), [1]) + self.assertAllEqual(shaper.get_event_dims(y), [2, 3]) + + def testShapeUtilGetShape(self): + with self.test_session() as sess: + shaper = _ShapeUtil(batch_ndims=0, event_ndims=0) + with self.assertRaises(ValueError): + y = tf.placeholder(tf.float32) + shaper.get_sample_shape(y) + with self.assertRaises(ValueError): + y = tf.placeholder(tf.float32) + shaper.get_batch_shape(y) + with self.assertRaises(ValueError): + y = tf.placeholder(tf.float32) + shaper.get_event_shape(y) + + shaper = _ShapeUtil(batch_ndims=0, event_ndims=0) + x = 1 + self.assertAllEqual(shaper.get_sample_shape(x), []) + self.assertAllEqual(shaper.get_batch_shape(x), []) + self.assertAllEqual(shaper.get_event_shape(x), []) + self.assertAllEqual(shaper.get_shape(x, batch=False), []) + + shaper = _ShapeUtil(batch_ndims=1, event_ndims=1) + x = [[[0., 1, 2], [3, 4, 5]]] + self.assertAllEqual(shaper.get_sample_shape(x), [1]) + self.assertAllEqual(shaper.get_batch_shape(x), [2]) + self.assertAllEqual(shaper.get_event_shape(x), [3]) + self.assertAllEqual(shaper.get_shape(x, batch=False), [1, 3]) + + x += [[[6, 7, 8], [9, 10, 11]]] + self.assertAllEqual(shaper.get_sample_shape(x), [2]) + self.assertAllEqual(shaper.get_batch_shape(x), [2]) + self.assertAllEqual(shaper.get_event_shape(x), [3]) + self.assertAllEqual(shaper.get_shape(x, batch=False), [2, 3]) + + shaper = _ShapeUtil(batch_ndims=0, event_ndims=1) + x = tf.ones((3, 2)) + self.assertAllEqual(shaper.get_shape(x, sample=False), (2,)) + + def feed_eval(fun, build_shape=(None, None, 2), graph_shape=(3, 4, 2)): + """Helper to use a deferred-shape tensor eval'ed at graph runtime.""" + y = tf.placeholder(tf.int32, shape=build_shape) + y_value = np.ones(graph_shape, dtype=y.dtype.as_numpy_dtype()) + return sess.run(fun(y), + feed_dict={y: y_value}) + + shaper = _ShapeUtil(batch_ndims=1, 
event_ndims=1) + self.assertAllEqual(feed_eval(shaper.get_sample_shape), [3]) + self.assertAllEqual(feed_eval(shaper.get_batch_shape), [4]) + self.assertAllEqual(feed_eval(shaper.get_event_shape), [2]) + self.assertAllEqual( + feed_eval(lambda y: shaper.get_shape(y, batch=False)), + [3, 2]) + + shaper = _ShapeUtil(batch_ndims=0, event_ndims=1) + self.assertAllEqual( + feed_eval(lambda y: shaper.get_shape(y, batch=False), + (None, None), + (3, 2)), + [3, 2]) + self.assertAllEqual( + feed_eval(lambda y: shaper.get_shape(y, sample=False), + (None, None), + (3, 2)), + [2]) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/contrib/distributions/python/ops/bernoulli.py b/tensorflow/contrib/distributions/python/ops/bernoulli.py index b3259b2867d..1db599b3fea 100644 --- a/tensorflow/contrib/distributions/python/ops/bernoulli.py +++ b/tensorflow/contrib/distributions/python/ops/bernoulli.py @@ -19,13 +19,13 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.distributions.python.ops import distribution +from tensorflow.contrib.distributions.python.ops import distribution_util from tensorflow.contrib.distributions.python.ops import kullback_leibler # pylint: disable=line-too-long from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops -from tensorflow.python.ops import check_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import random_ops @@ -36,10 +36,6 @@ class Bernoulli(distribution.Distribution): The Bernoulli distribution is parameterized by p, the probability of a positive event. 
- - Note, the following methods of the base class aren't implemented: - * cdf - * log_cdf """ def __init__(self, @@ -62,10 +58,10 @@ class Bernoulli(distribution.Distribution): dtype: dtype for samples. validate_args: Whether to assert that `0 <= p <= 1`. If not validate_args, `log_pmf` may return nans. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: A name for this distribution. Raises: @@ -75,25 +71,8 @@ class Bernoulli(distribution.Distribution): self._name = name self._dtype = dtype self._validate_args = validate_args - check_op = check_ops.assert_less_equal - if p is None and logits is None: - raise ValueError("Must pass p or logits.") - elif p is not None and logits is not None: - raise ValueError("Must pass either p or logits, not both.") - elif p is None: - with ops.op_scope([logits], name): - self._logits = array_ops.identity(logits, name="logits") - with ops.name_scope(name): - with ops.name_scope("p"): - self._p = math_ops.sigmoid(self._logits) - elif logits is None: - with ops.name_scope(name): - with ops.name_scope("p"): - with ops.control_dependencies([check_op(p, 1.), check_op(0., p)] if - validate_args else []): - self._p = array_ops.identity(p) - with ops.name_scope("logits"): - self._logits = math_ops.log(self._p) - math_ops.log(1. - self._p) + self._logits, self._p = distribution_util.get_logits_and_prob( + name=name, logits=logits, p=p, validate_args=validate_args) with ops.name_scope(name): with ops.name_scope("q"): self._q = 1. 
- self._p @@ -180,8 +159,12 @@ class Bernoulli(distribution.Distribution): event = ops.convert_to_tensor(event, name="event") event = math_ops.cast(event, self.logits.dtype) logits = self.logits - if ((event.get_shape().ndims is not None) or - (logits.get_shape().ndims is not None) or + # sigmoid_cross_entropy_with_logits doesn't broadcast shape, + # so we do this here. + # TODO(b/30637701): Check dynamic shape, and don't broadcast if the + # dynamic shapes are the same. + if (not event.get_shape().is_fully_defined() or + not logits.get_shape().is_fully_defined() or event.get_shape() != logits.get_shape()): logits = array_ops.ones_like(event) * logits event = array_ops.ones_like(logits) * event @@ -202,8 +185,7 @@ class Bernoulli(distribution.Distribution): with ops.name_scope(self.name): with ops.op_scope([self.p, n], name): n = ops.convert_to_tensor(n, name="n") - new_shape = array_ops.concat( - 0, [array_ops.expand_dims(n, 0), self.batch_shape()]) + new_shape = array_ops.concat(0, ([n], self.batch_shape())) uniform = random_ops.random_uniform( new_shape, seed=seed, dtype=dtypes.float32) sample = math_ops.less(uniform, self.p) diff --git a/tensorflow/contrib/distributions/python/ops/beta.py b/tensorflow/contrib/distributions/python/ops/beta.py index 2bd64180682..fcf4a9056c3 100644 --- a/tensorflow/contrib/distributions/python/ops/beta.py +++ b/tensorflow/contrib/distributions/python/ops/beta.py @@ -13,6 +13,7 @@ # limitations under the License. # ============================================================================== """The Beta distribution class.""" + from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -95,6 +96,7 @@ class Beta(distribution.Distribution): x = [.2, .3, .9] dist.pdf(x) # Shape [2] ``` + """ def __init__(self, a, b, validate_args=True, allow_nan_stats=False, @@ -102,20 +104,20 @@ class Beta(distribution.Distribution): """Initialize a batch of Beta distributions. 
Args: - a: Positive `float` or `double` tensor with shape broadcastable to + a: Positive floating point tensor with shape broadcastable to `[N1,..., Nm]` `m >= 0`. Defines this as a batch of `N1 x ... x Nm` different Beta distributions. This also defines the dtype of the distribution. - b: Positive `float` or `double` tensor with shape broadcastable to + b: Positive floating point tensor with shape broadcastable to `[N1,..., Nm]` `m >= 0`. Defines this as a batch of `N1 x ... x Nm` different Beta distributions. validate_args: Whether to assert valid values for parameters `a` and `b`, - and `x` in `prob` and `log_prob`. If False, correct behavior is not + and `x` in `prob` and `log_prob`. If `False`, correct behavior is not guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to prefix Ops created by this distribution class. Examples: @@ -127,6 +129,7 @@ class Beta(distribution.Distribution): # Define a 2-batch. 
dist = Beta([1.0, 2.0], [4.0, 5.0]) ``` + """ with ops.op_scope([a, b], name): with ops.control_dependencies([ @@ -276,8 +279,14 @@ class Beta(distribution.Distribution): array_ops.ones_like(a_b_sum, dtype=self.dtype))) else: return control_flow_ops.with_dependencies([ - check_ops.assert_less(one, a), - check_ops.assert_less(one, b)], mode) + check_ops.assert_less( + one, a, + message="mode not defined for components of a <= 1" + ), + check_ops.assert_less( + one, b, + message="mode not defined for components of b <= 1" + )], mode) def entropy(self, name="entropy"): """Entropy of the distribution in nats.""" @@ -306,7 +315,7 @@ class Beta(distribution.Distribution): """`Log(P[counts])`, computed for every batch member. Args: - x: Non-negative `float` or `double`, tensor whose shape can + x: Non-negative floating point tensor whose shape can be broadcast with `self.a` and `self.b`. For fixed leading dimensions, the last dimension represents counts for the corresponding Beta distribution in `self.a` and `self.b`. `x` is only legal if @@ -334,7 +343,7 @@ class Beta(distribution.Distribution): """`P[x]`, computed for every batch member. Args: - x: Non-negative `float`, `double` tensor whose shape can + x: Non-negative floating point tensor whose shape can be broadcast with `self.a` and `self.b`. For fixed leading dimensions, the last dimension represents x for the corresponding Beta distribution in `self.a` and `self.b`. `x` is only legal if is diff --git a/tensorflow/contrib/distributions/python/ops/bijector.py b/tensorflow/contrib/distributions/python/ops/bijector.py new file mode 100644 index 00000000000..ff54b6d386e --- /dev/null +++ b/tensorflow/contrib/distributions/python/ops/bijector.py @@ -0,0 +1,350 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""An API for reversible (bijective) transformations of random variables.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops + + +class _Bijector(object): + """An interface for transforming random variable(s). + + A bijector is characterized by three operations: + + 1) Forward Evaluation + Useful for turning one random outcome into another random outcome from a + different distribution. + + 2) Inverse Evaluation + Useful for "reversing" a transformation to compute one probability in terms + of another. + + 3) (log o det o Jacobian o inverse)(x) + "The log of the determinant of the matrix of all first-order partial + derivatives of the inverse function." + Useful for inverting a transformation to compute one probability in terms + of another. Geometrically, the det(Jacobian) is the volume of the + transformation and is used to scale the probability. + + By convention, transformations of random variables are named in terms of the + forward transformation. The forward transformation creates samples, the + inverse is useful for computing probabilities. + + Example transformations: + "Exponential" + + ``` + Y = g(X) = exp(X) + X ~ Normal(0, 1) # Univariate. 
+ ``` + + Implies: + + ``` + g^{-1}(Y) = log(Y) + |Jacobian(g^{-1})(y)| = 1 / y + Y ~ LogNormal(0, 1), i.e., + prob(Y=y) = |Jacobian(g^{-1})(y)| * prob(X=g^{-1}(y)) + = (1 / y) Normal(log(y); 0, 1) + ``` + + "ShiftAndScale" + + ``` + Y = g(X) = sqrtSigma * X + mu + X ~ MultivariateNormal(0, I_d) + ``` + + Implies: + + ``` + g^{-1}(Y) = inv(sqrtSigma) * (Y - mu) + |Jacobian(g^{-1})(y)| = det(inv(sqrtSigma)) + Y ~ MultivariateNormal(mu, sqrtSigma) , i.e., + prob(Y=y) = |Jacobian(g^{-1})(y)| * prob(X=g^{-1}(y)) + = det(sqrtSigma)^(-d) * + MultivariateNormal(inv(sqrtSigma) * (y - mu); 0, I_d) + ``` + + Example use: + Basic properties: + + ```python + x = ... # A tensor. + # Evaluate forward transformation. + fwd_x = my_bijector.forward(x) + x != my_bijector.forward(fwd_x) # Not equal because g(x) != g(g(x)). + x == my_bijector.inverse(fwd_x) + ``` + + Computing a log-likelihood: + + ```python + def transformed_log_pdf(bijector, log_pdf, x): + return (bijector.inverse_log_det_jacobian(x) + + log_pdf(bijector.inverse(x))) + ``` + + Transforming a random outcome: + + ```python + def transformed_sample(bijector, x): + return bijector.forward(x) + ``` + + """ + + # TODO(b/30476956): Try to remove constructor dependence on shape util. + def __init__(self, shaper=None, name=None): + """Constructs Bijector. + + A bijector transforms random variables into new random variables. Managing + shape is typically an important piece of this so a Bijector is usually + composed of ShapeUtil. The ShapeUtil object handles input shape checks as + well as reshaping/transposing for easier linear algebra operations. + + Example: + ```python + # Create the Y = g(X) = X transform which operates on 4-Tensors of vectors. + identity = Identity(ShapeUtil(batch_ndims=4, event_ndims=1)) + + # Create the Y = g(X) = exp(X) transform which operates on matrices. + exp = Exp(ShapeUtil(batch_ndims=0, event_ndims=2)) + ``` + + See Bijector subclass doc for more details and examples. 
+ + Args: + shaper: object used for managing and manipulating shape, typically an + instance of ShapeUtil. + name: The name to give Ops created by the initializer. + """ + self._shaper = shaper + self._name = name or type(self).__name__ + + @property + def shaper(self): + """Returns shape object used to manage shape constraints.""" + return self._shaper + + @property + def name(self): + """Returns the string name of this bijector.""" + return self._name + + def forward(self, x, name='forward'): + """Returns the forward bijector evaluation, i.e., X = g(Y). + + Args: + x: `Tensor`. The input to the "forward" evaluation. + name: The name to give this op. + + Returns: + `Tensor`. + """ + with ops.name_scope(self.name): + with ops.op_scope([x], name): + x = ops.convert_to_tensor(x) + return self._forward(x) + + def inverse(self, x, name='inverse'): + """Returns the inverse bijector evaluation, i.e., X = g^{-1}(Y). + + Args: + x: `Tensor`. The input to the "inverse" evaluation. + name: The name to give this op. + + Returns: + `Tensor`. + """ + with ops.name_scope(self.name): + with ops.op_scope([x], name): + x = ops.convert_to_tensor(x) + try: + return self._inverse(x) + except NotImplementedError: + return self._inverse_and_inverse_log_det_jacobian(x)[0] + + def inverse_log_det_jacobian(self, x, name='inverse_log_det_jacobian'): + """Returns the (log o det o Jacobian o inverse)(x). + + Mathematically, returns: log(det(dY/dX g^{-1}))(Y). + + Args: + x: `Tensor`. The input to the "inverse" Jacobian evaluation. + name: The name to give this op. + + Returns: + `Tensor`. 
+ """ + with ops.name_scope(self.name): + with ops.op_scope([x], name): + x = ops.convert_to_tensor(x) + try: + return self._inverse_log_det_jacobian(x) + except NotImplementedError: + return self._inverse_and_inverse_log_det_jacobian(x)[1] + + def inverse_and_inverse_log_det_jacobian( + self, x, name='inverse_and_inverse_log_det_jacobian'): + """Returns both the inverse evaluation and inverse_log_det_jacobian. + + Enables possibly more efficient calculation when both inverse and + corresponding Jacobian are needed. + + See `inverse()`, `inverse_log_det_jacobian()` for more details. + + Args: + x: `Tensor`. The input to the "inverse" Jacobian evaluation. + name: The name to give this op. + + Returns: + `Tensor`. + """ + with ops.name_scope(self.name): + with ops.op_scope([x], name): + x = ops.convert_to_tensor(x) + try: + return self._inverse_and_inverse_log_det_jacobian(x) + except NotImplementedError: + return self._inverse(x), self._inverse_log_det_jacobian(x) + + # Subclass interface. + def _forward(self, x): + """Subclass implementation of forward(). + + Args: + x: `Tensor`. The input to the "forward" evaluation. + + Raises: + `NotImplementedError`: if subclass implementation not provided + + Returns: + `Tensor`. + """ + raise NotImplementedError('_forward not implemented') + + def _inverse(self, x): + """Subclass implementation of inverse(). + + Args: + x: `Tensor`. The input to the "inverse" evaluation. + + Raises: + `NotImplementedError`: if subclass implementation not provided + + Returns: + `Tensor`. + """ + raise NotImplementedError('_inverse not implemented') + + def _inverse_log_det_jacobian(self, x): + """Subclass implementation of inverse_log_det_jacobian(). + + Args: + x: `Tensor`. The input to the "inverse" Jacobian evaluation. + + Raises: + `NotImplementedError`: if subclass implementation not provided + + Returns: + `Tensor`. 
+ """ + raise NotImplementedError('_inverse_log_det_jacobian not implemented') + + def _inverse_and_inverse_log_det_jacobian(self, x): + """Subclass implementation of inverse_and_inverse_log_det_jacobian(). + + Args: + x: `Tensor`. The input to the "inverse" evaluation. + + Returns: + List of two `Tensor` items, inverse and inverse_log_det_jacobian. + """ + raise NotImplementedError( + '_inverse_and_inverse_log_det_jacobian not implemented') + + +class _Identity(_Bijector): + """Bijector which computes Y = g(X) = X. + + Example Use: + ```python + # Create the Y=g(X)=X transform which works only on Tensors with 1 batch + # ndims and 1 event ndim (i.e., vector of vectors). + identity = Identity(ShapeUtil(batch_ndims=1, event_ndims=1)) + x = [[1., 2], + [3, 4]] + x == identity.forward(x) == identity.inverse(x) + ``` + + """ + + # TODO(b/30476956): Try to remove constructor dependence on shape util. + def __init__(self, shaper=None, name='Identity'): + super(_Identity, self).__init__(shaper, name) + + def _forward(self, x): + return x + + def _inverse(self, x): + return x + + def _inverse_log_det_jacobian(self, x): + result_shape = self.shaper.get_shape( + x, sample=True, batch=True, event=False) + return array_ops.zeros(result_shape, dtype=x.dtype) + + +class _Exp(_Bijector): + """Bijector which computes Y = g(X) = exp(X). + + Example Use: + ```python + # Create the Y=g(X)=exp(X) transform which works only on Tensors with 1 + # batch ndims and 2 event ndim (i.e., vector of matrices). + exp = Exp(ShapeUtil(batch_ndims=1, event_ndims=2)) + x = [[[1., 2], + [3, 4]], + [[5, 6], + [7, 8]]] + exp(x) == exp.forward(x) + log(x) == exp.inverse(x) + ``` + + """ + + # TODO(b/30476956): Try to remove constructor dependence on shape util. 
+ def __init__(self, shaper=None, name='Exp'): + super(_Exp, self).__init__(shaper, name) + + def _forward(self, x): + return math_ops.exp(x) + + def _inverse(self, x): + return math_ops.log(x) + + def _inverse_log_det_jacobian(self, x): + d = self.shaper.get_event_dims(x) + return -math_ops.reduce_sum(math_ops.log(x), d) + + def _inverse_and_inverse_log_det_jacobian(self, x): + y = math_ops.log(x) + d = self.shaper.get_event_dims(x) + return y, -math_ops.reduce_sum(y, d) diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py new file mode 100644 index 00000000000..9978d0ad613 --- /dev/null +++ b/tensorflow/contrib/distributions/python/ops/binomial.py @@ -0,0 +1,340 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""The Binomial distribution class.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# pylint: disable=line-too-long + +from tensorflow.contrib.distributions.python.ops import distribution +from tensorflow.contrib.distributions.python.ops import distribution_util +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops + +# pylint: enable=line-too-long + + +class Binomial(distribution.Distribution): + """Binomial distribution. + + This distribution is parameterized by a vector `p` of probabilities and `n`, + the total counts. + + #### Mathematical details + + The Binomial is a distribution over the number of successes in `n` independent + trials, with each trial having the same probability of success `p`. + The probability mass function (pmf): + + ```pmf(k) = n! / (k! * (n - k)!) * (p)^k * (1 - p)^(n - k)``` + + #### Examples + + Create a single distribution, corresponding to 5 coin flips. + + ```python + dist = Binomial(n=5., p=.5) + ``` + + Create a single distribution (using logits), corresponding to 5 coin flips. + + ```python + dist = Binomial(n=5., logits=0.) + ``` + + Creates 3 distributions with the third distribution most likely to have + successes. + + ```python + p = [.2, .3, .8] + # n will be broadcast to [4., 4., 4.], to match p. + dist = Binomial(n=4., p=p) + ``` + + The distribution functions can be evaluated on counts. + + ```python + # counts same shape as p. 
+ counts = [1., 2, 3] + dist.prob(counts) # Shape [3] + + # p will be broadcast to [[.2, .3, .8], [.2, .3, .8]] to match counts. + counts = [[1., 2, 1], [2, 2, 4]] + dist.prob(counts) # Shape [2, 3] + + # p will be broadcast to shape [5, 7, 3] to match counts. + counts = [[...]] # Shape [5, 7, 3] + dist.prob(counts) # Shape [5, 7, 3] + ``` + """ + + def __init__(self, + n, + logits=None, + p=None, + validate_args=True, + allow_nan_stats=False, + name="Binomial"): + """Initialize a batch of Binomial distributions. + + Args: + n: Non-negative floating point tensor with shape broadcastable to + `[N1,..., Nm]` with `m >= 0` and the same dtype as `p` or `logits`. + Defines this as a batch of `N1 x ... x Nm` different Binomial + distributions. Its components should be equal to integer values. + logits: Floating point tensor representing the log-odds of a + positive event with shape broadcastable to `[N1,..., Nm]` `m >= 0`, and + the same dtype as `n`. Each entry represents logits for the probability + of success for independent Binomial distributions. + p: Positive floating point tensor with shape broadcastable to + `[N1,..., Nm]` `m >= 0`, `p in [0, 1]`. Each entry represents the + probability of success for independent Binomial distributions. + validate_args: Whether to assert valid values for parameters `n` and `p`, + and `x` in `prob` and `log_prob`. If `False`, correct behavior is not + guaranteed. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. + name: The name to prefix Ops created by this distribution class. + + Examples: + + ```python + # Define 1-batch of a binomial distribution. + dist = Binomial(n=2., p=.9) + + # Define a 2-batch. 
+ dist = Binomial(n=[4., 5], p=[.1, .3]) + ``` + + """ + + self._logits, self._p = distribution_util.get_logits_and_prob( + name=name, logits=logits, p=p, validate_args=validate_args) + + with ops.op_scope([n], name): + with ops.control_dependencies([ + check_ops.assert_non_negative( + n, message="n has negative components."), + distribution_util.assert_integer_form( + n, message="n has non-integer components." + )] if validate_args else []): + self._n = array_ops.identity(n, name="convert_n") + + self._name = name + self._validate_args = validate_args + self._allow_nan_stats = allow_nan_stats + + self._mean = self._n * self._p + self._get_batch_shape = self._mean.get_shape() + self._get_event_shape = tensor_shape.TensorShape([]) + + @property + def name(self): + """Name to prepend to all ops.""" + return self._name + + @property + def dtype(self): + """dtype of samples from this distribution.""" + return self._p.dtype + + @property + def validate_args(self): + """Boolean describing behavior on invalid input.""" + return self._validate_args + + @property + def allow_nan_stats(self): + """Boolean describing behavior when a stat is undefined for batch member.""" + return self._allow_nan_stats + + def batch_shape(self, name="batch_shape"): + """Batch dimensions of this instance as a 1-D int32 `Tensor`. + + The product of the dimensions of the `batch_shape` is the number of + independent distributions of this kind the instance represents. + + Args: + name: name to give to the op + + Returns: + `Tensor` `batch_shape` + """ + return array_ops.shape(self._mean) + + def get_batch_shape(self): + """`TensorShape` available at graph construction time. + + Same meaning as `batch_shape`. May be only partially defined. + + Returns: + batch shape + """ + return self._get_batch_shape + + def event_shape(self, name="event_shape"): + """Shape of a sample from a single distribution as a 1-D int32 `Tensor`. 
+ + Args: + name: name to give to the op + + Returns: + `Tensor` `event_shape` + """ + with ops.name_scope(self.name): + with ops.op_scope([], name): + return constant_op.constant([], name=name, dtype=dtypes.int32) + + def get_event_shape(self): + """`TensorShape` available at graph construction time. + + Same meaning as `event_shape`. May be only partially defined. + + Returns: + event shape + """ + return self._get_event_shape + + @property + def n(self): + """Number of trials.""" + return self._n + + @property + def logits(self): + """Log-odds.""" + return self._logits + + @property + def p(self): + """Probability of success.""" + return self._p + + def mean(self, name="mean"): + """Mean of the distribution.""" + with ops.name_scope(self.name): + return array_ops.identity(self._mean, name=name) + + def variance(self, name="variance"): + """Variance of the distribution.""" + with ops.name_scope(self.name): + with ops.op_scope([self._n, self._p], name): + return self._n * self._p * (1 - self._p) + + def std(self, name="std"): + """Standard deviation of the distribution.""" + with ops.name_scope(self.name): + with ops.op_scope([self._n, self._p], name): + return math_ops.sqrt(self.variance()) + + def mode(self, name="mode"): + """Mode of the distribution. + + Note that when `(n + 1) * p` is an integer, there are actually two modes. + Namely, `(n + 1) * p` and `(n + 1) * p - 1` are both modes. Here we return + only the larger of the two modes. + + Args: + name: The name for this op. + + Returns: + The mode of the Binomial distribution. + """ + with ops.name_scope(self.name): + with ops.op_scope([self._n, self._p], name): + return math_ops.floor((self._n + 1) * self._p) + + def log_prob(self, counts, name="log_prob"): + """`Log(P[counts])`, computed for every batch member. + + For each batch member of counts `k`, `P[counts]` is the probability that + after sampling `n` draws from this Binomial distribution, the number of + successes is `k`. 
Note that different sequences of draws can result in the + same counts, thus the probability includes a combinatorial coefficient. + + Args: + counts: Non-negative tensor with dtype `dtype` and whose shape can be + broadcast with `self.p` and `self.n`. `counts` is only legal if it is + less than or equal to `n` and its components are equal to integer + values. + name: Name to give this Op, defaults to "log_prob". + + Returns: + Log probabilities for each record, shape `[N1,...,Nm]`. + """ + n = self._n + p = self._p + with ops.name_scope(self.name): + with ops.op_scope([self._n, self._p, counts], name): + counts = self._check_counts(counts) + + prob_prob = counts * math_ops.log(p) + ( + n - counts) * math_ops.log(1 - p) + + combinations = math_ops.lgamma(n + 1) - math_ops.lgamma( + counts + 1) - math_ops.lgamma(n - counts + 1) + log_prob = prob_prob + combinations + return log_prob + + def prob(self, counts, name="prob"): + """`P[counts]`, computed for every batch member. + + + For each batch member of counts `k`, `P[counts]` is the probability that + after sampling `n` draws from this Binomial distribution, the number of + successes is `k`. Note that different sequences of draws can result in the + same counts, thus the probability includes a combinatorial coefficient. + + Args: + counts: Non-negative tensor with dtype `dtype` and whose shape can be + broadcast with `self.p` and `self.n`. `counts` is only legal if it is + less than or equal to `n` and its components are equal to integer + values. + name: Name to give this Op, defaults to "prob". + + Returns: + Probabilities for each record, shape `[N1,...,Nm]`. 
+ """ + return super(Binomial, self).prob(counts, name=name) + + @property + def is_continuous(self): + return False + + @property + def is_reparameterized(self): + return False + + def _check_counts(self, counts): + """Check counts for proper shape, values, then return tensor version.""" + counts = ops.convert_to_tensor(counts, name="counts_before_deps") + if not self.validate_args: + return counts + return control_flow_ops.with_dependencies([ + check_ops.assert_non_negative( + counts, message="counts has negative components."), + check_ops.assert_less_equal( + counts, self._n, message="counts are not less than or equal to n."), + distribution_util.assert_integer_form( + counts, message="counts have non-integer components.")], counts) diff --git a/tensorflow/contrib/distributions/python/ops/categorical.py b/tensorflow/contrib/distributions/python/ops/categorical.py index 64572ed7885..e79a732a0c9 100644 --- a/tensorflow/contrib/distributions/python/ops/categorical.py +++ b/tensorflow/contrib/distributions/python/ops/categorical.py @@ -34,11 +34,6 @@ class Categorical(distribution.Distribution): The categorical distribution is parameterized by the log-probabilities of a set of classes. - - Note, the following methods of the base class aren't implemented: - * mean - * cdf - * log_cdf """ def __init__( @@ -57,10 +52,10 @@ class Categorical(distribution.Distribution): indexes into the classes. dtype: The type of the event samples (default: int32). validate_args: Unused in this distribution. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. 
If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: A name for this distribution (optional). """ self._allow_nan_stats = allow_nan_stats @@ -177,8 +172,7 @@ class Categorical(distribution.Distribution): samples = math_ops.cast(samples, self._dtype) ret = array_ops.reshape( array_ops.transpose(samples), - array_ops.concat( - 0, [array_ops.expand_dims(n, 0), self.batch_shape()])) + array_ops.concat(0, ([n], self.batch_shape()))) ret.set_shape(tensor_shape.vector(tensor_util.constant_value(n)) .concatenate(self.get_batch_shape())) return ret diff --git a/tensorflow/contrib/distributions/python/ops/chi2.py b/tensorflow/contrib/distributions/python/ops/chi2.py index 65840373f12..e09ef6324b8 100644 --- a/tensorflow/contrib/distributions/python/ops/chi2.py +++ b/tensorflow/contrib/distributions/python/ops/chi2.py @@ -42,15 +42,15 @@ class Chi2(gamma.Gamma): """Construct Chi2 distributions with parameter `df`. Args: - df: `float` or `double` tensor, the degrees of freedom of the + df: Floating point tensor, the degrees of freedom of the distribution(s). `df` must contain only positive values. validate_args: Whether to assert that `df > 0`, and that `x > 0` in the - methods `prob(x)` and `log_prob(x)`. If `validate_args` is False + methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False` and the inputs are invalid, correct behavior is not guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. 
name: The name to prepend to all ops created by this distribution. """ # Even though all stats of chi2 are defined for valid parameters, this is diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet.py b/tensorflow/contrib/distributions/python/ops/dirichlet.py index b4f59d5bd8c..25aee5cf03e 100644 --- a/tensorflow/contrib/distributions/python/ops/dirichlet.py +++ b/tensorflow/contrib/distributions/python/ops/dirichlet.py @@ -19,9 +19,8 @@ from __future__ import print_function # pylint: disable=line-too-long -import numpy as np - from tensorflow.contrib.distributions.python.ops import distribution +from tensorflow.contrib.distributions.python.ops import distribution_util from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape @@ -29,7 +28,6 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import logging_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import special_math_ops @@ -37,24 +35,6 @@ from tensorflow.python.ops import special_math_ops # pylint: enable=line-too-long -def _assert_close(x, y, data=None, summarize=None, name=None): - if x.dtype.is_integer: - return check_ops.assert_equal( - x, y, data=data, summarize=summarize, name=name) - - with ops.op_scope([x, y, data], name, "assert_close"): - x = ops.convert_to_tensor(x, name="x") - y = ops.convert_to_tensor(y, name="y") - tol = np.finfo(x.dtype.as_numpy_dtype).resolution - if data is None: - data = [ - "Condition x ~= y did not hold element-wise: x = ", x.name, x, "y = ", - y.name, y - ] - condition = math_ops.reduce_all(math_ops.less_equal(math_ops.abs(x-y), tol)) - return logging_ops.Assert(condition, data, summarize=summarize) - - class 
Dirichlet(distribution.Distribution): """Dirichlet distribution. @@ -117,6 +97,7 @@ class Dirichlet(distribution.Distribution): x = [.2, .3, .5] dist.prob(x) # Shape [2] ``` + """ def __init__(self, @@ -127,16 +108,16 @@ class Dirichlet(distribution.Distribution): """Initialize a batch of Dirichlet distributions. Args: - alpha: Positive `float` or `double` tensor with shape broadcastable to + alpha: Positive floating point tensor with shape broadcastable to `[N1,..., Nm, k]` `m >= 0`. Defines this as a batch of `N1 x ... x Nm` different `k` class Dirichlet distributions. validate_args: Whether to assert valid values for parameters `alpha` and - `x` in `prob` and `log_prob`. If False, correct behavior is not + `x` in `prob` and `log_prob`. If `False`, correct behavior is not guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to prefix Ops created by this distribution class. Examples: @@ -149,6 +130,7 @@ class Dirichlet(distribution.Distribution): # Define a 2-batch of 3-class distributions. 
dist = Dirichlet([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) ``` + """ with ops.op_scope([alpha], name): alpha = ops.convert_to_tensor(alpha, name="alpha_before_deps") @@ -302,7 +284,9 @@ class Dirichlet(distribution.Distribution): array_ops.ones_like(self._alpha, dtype=self.dtype))) else: return control_flow_ops.with_dependencies([ - check_ops.assert_less(one, self._alpha) + check_ops.assert_less( + one, self._alpha, + message="mode not defined for components of alpha <= 1") ], mode) def entropy(self, name="entropy"): @@ -334,7 +318,7 @@ class Dirichlet(distribution.Distribution): """`Log(P[counts])`, computed for every batch member. Args: - x: Non-negative `float` or `double`, tensor whose shape can + x: Non-negative tensor with dtype `dtype` and whose shape can be broadcast with `self.alpha`. For fixed leading dimensions, the last dimension represents counts for the corresponding Dirichlet distribution in `self.alpha`. `x` is only legal if it sums up to one. @@ -359,7 +343,7 @@ class Dirichlet(distribution.Distribution): """`P[x]`, computed for every batch member. Args: - x: Non-negative `float`, `double` tensor whose shape can + x: Non-negative tensor with dtype `dtype` and whose shape can be broadcast with `self.alpha`. For fixed leading dimensions, the last dimension represents x for the corresponding Dirichlet distribution in `self.alpha` and `self.beta`. `x` is only legal if it sums up to one. 
@@ -407,7 +391,8 @@ class Dirichlet(distribution.Distribution): x = ops.convert_to_tensor(x, name="x_before_deps") candidate_one = math_ops.reduce_sum(x, reduction_indices=[-1]) one = constant_op.constant(1., self.dtype) - dependencies = [check_ops.assert_positive(x), check_ops.assert_less(x, one), - _assert_close(one, candidate_one) + dependencies = [check_ops.assert_positive(x), check_ops.assert_less( + x, one, message="x has components greater than or equal to 1"), + distribution_util.assert_close(one, candidate_one) ] if self.validate_args else [] return control_flow_ops.with_dependencies(dependencies, x) diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py b/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py index 6982a733813..67cdd566c67 100644 --- a/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py +++ b/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py @@ -13,13 +13,15 @@ # limitations under the License. 
# ============================================================================== """The Dirichlet Multinomial distribution class.""" + from __future__ import absolute_import from __future__ import division from __future__ import print_function # pylint: disable=line-too-long -from tensorflow.contrib.distributions.python.ops import distribution # pylint: disable=line-too-long +from tensorflow.contrib.distributions.python.ops import distribution +from tensorflow.contrib.distributions.python.ops import distribution_util from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops @@ -30,34 +32,6 @@ from tensorflow.python.ops import special_math_ops # pylint: enable=line-too-long -def _assert_integer_form(x): - """Check x for integer components (or floats that are equal to integers).""" - x = ops.convert_to_tensor(x, name='x') - casted_x = math_ops.to_int64(x) - return check_ops.assert_equal(x, math_ops.cast( - math_ops.round(casted_x), x.dtype)) - - -def _log_combinations(n, counts, name='log_combinations'): - """Log number of ways counts could have come in.""" - # First a bit about the number of ways counts could have come in: - # E.g. if counts = [1, 2], then this is 3 choose 2. - # In general, this is (sum counts)! / sum(counts!) - # The sum should be along the last dimension of counts. This is the - # "distribution" dimension. Here n a priori represents the sum of counts. - with ops.op_scope([counts], name): - # To compute factorials, use the fact that Gamma(n + 1) = n! - # Compute two terms, each a sum over counts. Compute each for each - # batch member. - # Log Gamma((sum counts) + 1) = Log((sum counts)!) - total_permutations = math_ops.lgamma(n + 1) - # sum(Log Gamma(counts + 1)) = Log sum(counts!) 
- counts_factorial = math_ops.lgamma(counts + 1) - redundant_permutations = math_ops.reduce_sum(counts_factorial, - reduction_indices=[-1]) - return total_permutations - redundant_permutations - - class DirichletMultinomial(distribution.Distribution): """DirichletMultinomial mixture distribution. @@ -126,38 +100,35 @@ class DirichletMultinomial(distribution.Distribution): counts = [2, 1, 0] dist.pmf(counts) # Shape [2] ``` + """ # TODO(b/27419586) Change docstring for dtype of alpha once int allowed. def __init__(self, n, alpha, - allow_arbitrary_counts=False, validate_args=True, allow_nan_stats=False, - name='DirichletMultinomial'): + name="DirichletMultinomial"): """Initialize a batch of DirichletMultinomial distributions. Args: - n: Non-negative `float` or `double` tensor with shape - broadcastable to `[N1,..., Nm]` with `m >= 0`. Defines this as a batch - of `N1 x ... x Nm` different Dirichlet multinomial distributions. Its - components should be equal to integral values. - alpha: Positive `float` or `double` tensor with shape broadcastable to - `[N1,..., Nm, k]` `m >= 0`. Defines this as a batch of `N1 x ... x Nm` - different `k` class Dirichlet multinomial distributions. - allow_arbitrary_counts: Boolean. This represents whether the pmf/cdf - allows for the `counts` tensor to be non-integral values. - The pmf/cdf are functions that can be evaluated at non-integral values, - but are only a distribution over non-negative integers. If - `validate_args` is `False`, this assertion is turned off. + n: Non-negative floating point tensor, whose dtype is the same as + `alpha`. The shape is broadcastable to `[N1,..., Nm]` with `m >= 0`. + Defines this as a batch of `N1 x ... x Nm` different Dirichlet + multinomial distributions. Its components should be equal to integer + values. + alpha: Positive floating point tensor, whose dtype is the same as + `n` with shape broadcastable to `[N1,..., Nm, k]` `m >= 0`. Defines + this as a batch of `N1 x ... 
x Nm` different `k` class Dirichlet + multinomial distributions. validate_args: Whether to assert valid values for parameters `alpha` and - `n`, and `x` in `prob` and `log_prob`. If False, correct behavior is + `n`, and `x` in `prob` and `log_prob`. If `False`, correct behavior is not guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to prefix Ops created by this distribution class. Examples: @@ -170,11 +141,11 @@ class DirichletMultinomial(distribution.Distribution): # Define a 2-batch of 3-class distributions. dist = DirichletMultinomial([3., 4], [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) ``` + """ self._allow_nan_stats = allow_nan_stats self._validate_args = validate_args self._name = name - self._allow_arbitrary_counts = allow_arbitrary_counts with ops.op_scope([n, alpha], name): # Broadcasting works because: # * The broadcasting convention is to prepend dimensions of size [1], and @@ -186,8 +157,7 @@ class DirichletMultinomial(distribution.Distribution): # * All calls involving `counts` eventually require a broadcast between # `counts` and alpha. 
self._alpha = self._check_alpha(alpha) - n = self._check_n(n) - self._n = math_ops.cast(n, self._alpha.dtype) + self._n = self._check_n(n) self._alpha_sum = math_ops.reduce_sum( self._alpha, reduction_indices=[-1], keep_dims=False) @@ -227,7 +197,7 @@ class DirichletMultinomial(distribution.Distribution): """dtype of samples from this distribution.""" return self._alpha.dtype - def mean(self, name='mean'): + def mean(self, name="mean"): """Class means for every batch member.""" alpha = self._alpha alpha_sum = self._alpha_sum @@ -237,7 +207,7 @@ class DirichletMultinomial(distribution.Distribution): mean_no_n = alpha / array_ops.expand_dims(alpha_sum, -1) return array_ops.expand_dims(n, -1) * mean_no_n - def variance(self, name='mean'): + def variance(self, name="mean"): """Class variances for every batch member. The variance for each batch member is defined as the following: @@ -279,7 +249,7 @@ class DirichletMultinomial(distribution.Distribution): variance *= array_ops.expand_dims(shared_factor, -1) return variance - def batch_shape(self, name='batch_shape'): + def batch_shape(self, name="batch_shape"): """Batch dimensions of this instance as a 1-D int32 `Tensor`. The product of the dimensions of the `batch_shape` is the number of @@ -305,7 +275,7 @@ class DirichletMultinomial(distribution.Distribution): """ return self._get_batch_shape - def event_shape(self, name='event_shape'): + def event_shape(self, name="event_shape"): """Shape of a sample from a single distribution as a 1-D int32 `Tensor`. 
Args: @@ -328,15 +298,15 @@ class DirichletMultinomial(distribution.Distribution): """ return self._get_event_shape - def cdf(self, x, name='cdf'): + def cdf(self, x, name="cdf"): raise NotImplementedError( - 'DirichletMultinomial does not have a well-defined cdf.') + "DirichletMultinomial does not have a well-defined cdf.") - def log_cdf(self, x, name='log_cdf'): + def log_cdf(self, x, name="log_cdf"): raise NotImplementedError( - 'DirichletMultinomial does not have a well-defined cdf.') + "DirichletMultinomial does not have a well-defined cdf.") - def log_prob(self, counts, name='log_prob'): + def log_prob(self, counts, name="log_prob"): """`Log(P[counts])`, computed for every batch member. For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability @@ -346,12 +316,11 @@ class DirichletMultinomial(distribution.Distribution): probability includes a combinatorial coefficient. Args: - counts: Non-negative `float` or `double` tensor whose shape can - be broadcast with `self.alpha`. For fixed leading dimensions, the last + counts: Non-negative tensor with dtype `dtype` and whose shape can be + broadcast with `self.alpha`. For fixed leading dimensions, the last dimension represents counts for the corresponding Dirichlet Multinomial distribution in `self.alpha`. `counts` is only legal if it sums up to - `n` and its components are equal to integral values. The second - condition is relaxed if `allow_arbitrary_counts` is set. + `n` and its components are equal to integer values. name: Name to give this Op, defaults to "log_prob". Returns: @@ -362,25 +331,14 @@ class DirichletMultinomial(distribution.Distribution): with ops.name_scope(self.name): with ops.op_scope([n, alpha, counts], name): counts = self._check_counts(counts) - # Use the same dtype as alpha for computations. 
- counts = math_ops.cast(counts, self.dtype) ordered_prob = (special_math_ops.lbeta(alpha + counts) - special_math_ops.lbeta(alpha)) - log_prob = ordered_prob + _log_combinations(n, counts) - # If alpha = counts = [[]], ordered_prob carries the right shape, which - # is []. However, since reduce_sum([[]]) = [0], log_combinations = [0], - # which is not correct. Luckily, [] + [0] = [], so the sum is fine, but - # shape must be inferred from ordered_prob. We must also make this - # broadcastable with n, so this is multiplied by n to ensure the shape - # is correctly inferred. - # Note also that tf.constant([]).get_shape() = - # TensorShape([Dimension(0)]) - broadcasted_tensor = ordered_prob * n - log_prob.set_shape(broadcasted_tensor.get_shape()) + log_prob = ordered_prob + distribution_util.log_combinations( + n, counts) return log_prob - def prob(self, counts, name='prob'): + def prob(self, counts, name="prob"): """`P[counts]`, computed for every batch member. For each batch of counts `[c_1,...,c_k]`, `P[counts]` is the probability @@ -390,12 +348,11 @@ class DirichletMultinomial(distribution.Distribution): probability includes a combinatorial coefficient. Args: - counts: Non-negative `float`, `double` tensor whose shape can - be broadcast with `self.alpha`. For fixed leading dimensions, the last + counts: Non-negative tensor with dtype `dtype` and whose shape can be + broadcast with `self.alpha`. For fixed leading dimensions, the last dimension represents counts for the corresponding Dirichlet Multinomial distribution in `self.alpha`. `counts` is only legal if it sums up to - `n` and its components are equal to integral values. The second - condition is relaxed if `allow_arbitrary_counts` is set. + `n` and its components are equal to integer values. name: Name to give this Op, defaults to "prob". 
Returns: @@ -405,21 +362,21 @@ class DirichletMultinomial(distribution.Distribution): def _check_counts(self, counts): """Check counts for proper shape, values, then return tensor version.""" - counts = ops.convert_to_tensor(counts, name='counts') + counts = ops.convert_to_tensor(counts, name="counts") if not self.validate_args: return counts candidate_n = math_ops.reduce_sum(counts, reduction_indices=[-1]) - dependencies = [check_ops.assert_non_negative(counts), - check_ops.assert_equal(self._n, - math_ops.cast(candidate_n, - self._n.dtype))] - if not self._allow_arbitrary_counts: - dependencies += [_assert_integer_form(counts)] - return control_flow_ops.with_dependencies(dependencies, counts) + return control_flow_ops.with_dependencies([ + check_ops.assert_non_negative(counts), + check_ops.assert_equal( + self._n, candidate_n, + message="counts do not sum to n" + ), + distribution_util.assert_integer_form(counts)], counts) def _check_alpha(self, alpha): - alpha = ops.convert_to_tensor(alpha, name='alpha') + alpha = ops.convert_to_tensor(alpha, name="alpha") if not self.validate_args: return alpha return control_flow_ops.with_dependencies( @@ -427,11 +384,12 @@ class DirichletMultinomial(distribution.Distribution): check_ops.assert_positive(alpha)], alpha) def _check_n(self, n): - n = ops.convert_to_tensor(n, name='n') + n = ops.convert_to_tensor(n, name="n") if not self.validate_args: return n return control_flow_ops.with_dependencies( - [check_ops.assert_non_negative(n), _assert_integer_form(n)], n) + [check_ops.assert_non_negative(n), + distribution_util.assert_integer_form(n)], n) @property def is_continuous(self): diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py new file mode 100644 index 00000000000..9c751270032 --- /dev/null +++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py @@ -0,0 +1,177 @@ +# Copyright 2016 The TensorFlow Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utilities for probability distributions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import logging_ops +from tensorflow.python.ops import math_ops + + +def assert_close( + x, y, data=None, summarize=None, message=None, name="assert_close"): + """Assert that that x and y are within machine epsilon of each other. + + Args: + x: Numeric `Tensor` + y: Numeric `Tensor` + data: The tensors to print out if the condition is `False`. Defaults to + error message and first few entries of `x` and `y`. + summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. + name: A name for this operation (optional). + + Returns: + Op raising `InvalidArgumentError` if |x - y| > machine epsilon. 
+ """ + message = message or "" + x = ops.convert_to_tensor(x, name="x") + y = ops.convert_to_tensor(y, name="y") + + if x.dtype.is_integer: + return check_ops.assert_equal( + x, y, data=data, summarize=summarize, message=message, name=name) + + with ops.op_scope([x, y, data], name, "assert_close"): + tol = np.finfo(x.dtype.as_numpy_dtype).resolution + if data is None: + data = [ + message, + "Condition x ~= y did not hold element-wise: x = ", x.name, x, "y = ", + y.name, y + ] + condition = math_ops.reduce_all(math_ops.less_equal(math_ops.abs(x-y), tol)) + return logging_ops.Assert( + condition, data, summarize=summarize) + + +def assert_integer_form( + x, data=None, summarize=None, message=None, name="assert_integer_form"): + """Assert that x has integer components (or floats equal to integers). + + Args: + x: Numeric `Tensor` + data: The tensors to print out if the condition is `False`. Defaults to + error message and first few entries of `x` and `y`. + summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. + name: A name for this operation (optional). + + Returns: + Op raising `InvalidArgumentError` if round(x) != x. + """ + + message = message or "x has non-integer components" + x = ops.convert_to_tensor(x, name="x") + casted_x = math_ops.to_int64(x) + return check_ops.assert_equal( + x, math_ops.cast(math_ops.round(casted_x), x.dtype), + data=data, summarize=summarize, message=message, name=name) + + +def get_logits_and_prob( + logits=None, p=None, multidimensional=False, validate_args=True, name=None): + """Converts logits to probabilities and vice-versa, and returns both. + + Args: + logits: Numeric `Tensor` representing log-odds. + p: Numeric `Tensor` representing probabilities. + multidimensional: Given `p` a [N1, N2, ... k] dimensional tensor, + whether the last dimension represents the probability between k classes. + This will additionally assert that the values in the last dimension + sum to one. 
If `False`, will instead assert that each value is in + `[0, 1]`. + validate_args: Whether to assert `0 <= p <= 1` if multidimensional is + `False`, otherwise that the last dimension of `p` sums to one. + name: A name for this operation (optional). + + Returns: + Tuple with `logits` and `p`. If `p` has an entry that is `0` or `1`, then + the corresponding entry in the returned logits will be `-Inf` and `Inf` + respectively. + + Raises: + ValueError: if neither `p` nor `logits` were passed in, or both were. + """ + if p is None and logits is None: + raise ValueError("Must pass p or logits.") + elif p is not None and logits is not None: + raise ValueError("Must pass either p or logits, not both.") + elif p is None: + with ops.op_scope([logits], name): + logits = array_ops.identity(logits, name="logits") + with ops.name_scope(name): + with ops.name_scope("p"): + p = math_ops.sigmoid(logits) + elif logits is None: + with ops.name_scope(name): + with ops.name_scope("p"): + p = array_ops.identity(p) + if validate_args: + one = constant_op.constant(1., p.dtype) + dependencies = [check_ops.assert_non_negative(p)] + if multidimensional: + dependencies += [assert_close( + math_ops.reduce_sum(p, reduction_indices=[-1]), + one, message="p does not sum to 1.")] + else: + dependencies += [check_ops.assert_less_equal( + p, one, message="p has components greater than 1.")] + p = control_flow_ops.with_dependencies(dependencies, p) + with ops.name_scope("logits"): + logits = math_ops.log(p) - math_ops.log(1. - p) + return (logits, p) + + +def log_combinations(n, counts, name="log_combinations"): + """Multinomial coefficient. + + Given `n` and `counts`, where `counts` has last dimension `k`, we compute + the multinomial coefficient as: + + ```n! / sum_i n_i!``` + + where `i` runs over all `k` classes. + + Args: + n: Numeric `Tensor` broadcastable with `counts`. This represents `n` + outcomes. + counts: Numeric `Tensor` broadcastable with `n`. 
This represents counts + in `k` classes, where `k` is the last dimension of the tensor. + name: A name for this operation (optional). + + Returns: + `Tensor` representing the multinomial coefficient between `n` and `counts`. + """ + # First a bit about the number of ways counts could have come in: + # E.g. if counts = [1, 2], then this is 3 choose 2. + # In general, this is (sum counts)! / sum(counts!) + # The sum should be along the last dimension of counts. This is the + # "distribution" dimension. Here n a priori represents the sum of counts. + with ops.op_scope([n, counts], name): + total_permutations = math_ops.lgamma(n + 1) + counts_factorial = math_ops.lgamma(counts + 1) + redundant_permutations = math_ops.reduce_sum(counts_factorial, + reduction_indices=[-1]) + return total_permutations - redundant_permutations diff --git a/tensorflow/contrib/distributions/python/ops/exponential.py b/tensorflow/contrib/distributions/python/ops/exponential.py index 13b26a11db2..c1a7eb025ef 100644 --- a/tensorflow/contrib/distributions/python/ops/exponential.py +++ b/tensorflow/contrib/distributions/python/ops/exponential.py @@ -46,15 +46,15 @@ class Exponential(gamma.Gamma): """Construct Exponential distribution with parameter `lam`. Args: - lam: `float` or `double` tensor, the rate of the distribution(s). + lam: Floating point tensor, the rate of the distribution(s). `lam` must contain only positive values. validate_args: Whether to assert that `lam > 0`, and that `x > 0` in the - methods `prob(x)` and `log_prob(x)`. If `validate_args` is False + methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False` and the inputs are invalid, correct behavior is not guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. 
+ allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to prepend to all ops created by this distribution. """ # Even though all statistics of are defined for valid inputs, this is not @@ -95,13 +95,13 @@ class Exponential(gamma.Gamma): broadcast_shape = self._lam.get_shape() with ops.op_scope([self.lam, n], name, "ExponentialSample"): n = ops.convert_to_tensor(n, name="n") - shape = array_ops.concat( - 0, [array_ops.pack([n]), array_ops.shape(self._lam)]) + shape = array_ops.concat(0, ([n], array_ops.shape(self._lam))) # Sample uniformly-at-random from the open-interval (0, 1). sampled = random_ops.random_uniform( shape, minval=np.nextafter( self.dtype.as_numpy_dtype(0.), self.dtype.as_numpy_dtype(1.)), maxval=constant_op.constant(1.0, dtype=self.dtype), + seed=seed, dtype=self.dtype) n_val = tensor_util.constant_value(n) diff --git a/tensorflow/contrib/distributions/python/ops/gamma.py b/tensorflow/contrib/distributions/python/ops/gamma.py index 1f733ceda16..6bd93877613 100644 --- a/tensorflow/contrib/distributions/python/ops/gamma.py +++ b/tensorflow/contrib/distributions/python/ops/gamma.py @@ -69,19 +69,19 @@ class Gamma(distribution.Distribution): broadcasting (e.g. `alpha + beta` is a valid operation). Args: - alpha: `float` or `double` tensor, the shape params of the + alpha: Floating point tensor, the shape params of the distribution(s). alpha must contain only positive values. - beta: `float` or `double` tensor, the inverse scale params of the + beta: Floating point tensor, the inverse scale params of the distribution(s). beta must contain only positive values. validate_args: Whether to assert that `a > 0, b > 0`, and that `x > 0` in - the methods `prob(x)` and `log_prob(x)`. 
If `validate_args` is False + the methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False` and the inputs are invalid, correct behavior is not guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to prepend to all ops created by this distribution. Raises: @@ -213,9 +213,12 @@ class Gamma(distribution.Distribution): nan = np.nan * self._ones() return math_ops.select(alpha_ge_1, mode_if_defined, nan) else: - one = ops.convert_to_tensor(1.0, dtype=self.dtype) + one = constant_op.constant(1.0, dtype=self.dtype) return control_flow_ops.with_dependencies( - [check_ops.assert_less(one, alpha)], mode_if_defined) + [check_ops.assert_less( + one, alpha, + message="mode not defined for components of alpha <= 1" + )], mode_if_defined) def variance(self, name="variance"): """Variance of each batch member.""" diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py index a23f6df5717..d78e82a7524 100644 --- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py +++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py @@ -69,18 +69,18 @@ class InverseGamma(distribution.Distribution): broadcasting (e.g. `alpha + beta` is a valid operation). Args: - alpha: `float` or `double` tensor, the shape params of the + alpha: Floating point tensor, the shape params of the distribution(s). alpha must contain only positive values. 
- beta: `float` or `double` tensor, the scale params of the distribution(s). + beta: Floating point tensor, the scale params of the distribution(s). beta must contain only positive values. validate_args: Whether to assert that `a > 0, b > 0`, and that `x > 0` in - the methods `prob(x)` and `log_prob(x)`. If `validate_args` is False + the methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False` and the inputs are invalid, correct behavior is not guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to prepend to all ops created by this distribution. Raises: @@ -206,9 +206,12 @@ class InverseGamma(distribution.Distribution): nan = np.nan * self._ones() return math_ops.select(alpha_gt_1, mean_if_defined, nan) else: - one = ops.convert_to_tensor(1.0, dtype=self.dtype) + one = constant_op.constant(1.0, dtype=self.dtype) return control_flow_ops.with_dependencies( - [check_ops.assert_less(one, alpha)], mean_if_defined) + [check_ops.assert_less( + one, alpha, + message="mean not defined for components of alpha <= 1")], + mean_if_defined) def mode(self, name="mode"): """Mode of each batch member. 
@@ -250,9 +253,12 @@ class InverseGamma(distribution.Distribution): nan = np.nan * self._ones() return math_ops.select(alpha_gt_2, var_if_defined, nan) else: - two = ops.convert_to_tensor(2.0, dtype=self.dtype) + two = constant_op.constant(2.0, dtype=self.dtype) return control_flow_ops.with_dependencies( - [check_ops.assert_less(two, alpha)], var_if_defined) + [check_ops.assert_less( + two, alpha, + message="variance not defined for components of alpha <= 2")], + var_if_defined) def log_prob(self, x, name="log_prob"): """Log prob of observations in `x` under these InverseGamma distribution(s). diff --git a/tensorflow/contrib/distributions/python/ops/kullback_leibler.py b/tensorflow/contrib/distributions/python/ops/kullback_leibler.py index c134ca2cbfd..c1e0b2d2398 100644 --- a/tensorflow/contrib/distributions/python/ops/kullback_leibler.py +++ b/tensorflow/contrib/distributions/python/ops/kullback_leibler.py @@ -34,9 +34,9 @@ def kl(dist_a, dist_b, allow_nan=False, name=None): Args: dist_a: instance of distributions.Distribution. dist_b: instance of distributions.Distribution. - allow_nan: If False (default), a runtime error is raised + allow_nan: If `False` (default), a runtime error is raised if the KL returns NaN values for any batch entry of the given - distributions. If True, the KL may return a NaN for the given entry. + distributions. If `True`, the KL may return a NaN for the given entry. name: (optional) Name scope to use for created operations. Returns: diff --git a/tensorflow/contrib/distributions/python/ops/laplace.py b/tensorflow/contrib/distributions/python/ops/laplace.py index ee6aa81c0f4..a03a80d4ece 100644 --- a/tensorflow/contrib/distributions/python/ops/laplace.py +++ b/tensorflow/contrib/distributions/python/ops/laplace.py @@ -60,17 +60,17 @@ class Laplace(distribution.Distribution): broadcasting (e.g., `loc / scale` is a valid operation). 
Args: - loc: `float` or `double` tensor which characterizes the location (center) + loc: Floating point tensor which characterizes the location (center) of the distribution. - scale: `float` or `double`, positive-valued tensor which characterzes the - spread of the distribution. + scale: Positive floating point tensor which characterizes the spread of + the distribution. validate_args: Whether to validate input with asserts. If `validate_args` is `False`, and the inputs are invalid, correct behavior is not guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to give Ops created by the initializer. Raises: @@ -294,8 +294,7 @@ class Laplace(distribution.Distribution): with ops.op_scope([self._loc, self._scale, n], name): n = ops.convert_to_tensor(n) n_val = tensor_util.constant_value(n) - shape = array_ops.concat( - 0, [array_ops.pack([n]), self.batch_shape()]) + shape = array_ops.concat(0, ([n], self.batch_shape())) # Sample uniformly-at-random from the open-interval (-1, 1). uniform_samples = random_ops.random_uniform( shape=shape, diff --git a/tensorflow/contrib/distributions/python/ops/multinomial.py b/tensorflow/contrib/distributions/python/ops/multinomial.py new file mode 100644 index 00000000000..477dd06673e --- /dev/null +++ b/tensorflow/contrib/distributions/python/ops/multinomial.py @@ -0,0 +1,343 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""The Multinomial distribution class.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# pylint: disable=line-too-long + +from tensorflow.contrib.distributions.python.ops import distribution +from tensorflow.contrib.distributions.python.ops import distribution_util +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops + +# pylint: enable=line-too-long + + +class Multinomial(distribution.Distribution): + """Multinomial distribution. + + This distribution is parameterized by a vector `p` of probability + parameters for `k` classes and `n`, the counts per each class.. + + #### Mathematical details + + The Multinomial is a distribution over k-class count data, meaning + for each k-tuple of non-negative integer `counts = [n_1,...,n_k]`, we have a + probability of these draws being made from the distribution. The distribution + has hyperparameters `p = (p_1,...,p_k)`, and probability mass + function (pmf): + + ```pmf(counts) = n! / (n_1!...n_k!) * (p_1)^n_1*(p_2)^n_2*...(p_k)^n_k``` + + where above `n = sum_j n_j`, `n!` is `n` factorial. 
+ + #### Examples + + Create a 3-class distribution, with the 3rd class is most likely to be drawn, + using logits.. + + ```python + logits = [-50., -43, 0] + dist = Multinomial(n=4., logits=logits) + ``` + + Create a 3-class distribution, with the 3rd class is most likely to be drawn. + + ```python + p = [.2, .3, .5] + dist = Multinomial(n=4., p=p) + ``` + + The distribution functions can be evaluated on counts. + + ```python + # counts same shape as p. + counts = [1., 0, 3] + dist.prob(counts) # Shape [] + + # p will be broadcast to [[.2, .3, .5], [.2, .3, .5]] to match counts. + counts = [[1., 2, 1], [2, 2, 0]] + dist.prob(counts) # Shape [2] + + # p will be broadcast to shape [5, 7, 3] to match counts. + counts = [[...]] # Shape [5, 7, 3] + dist.prob(counts) # Shape [5, 7] + ``` + + Create a 2-batch of 3-class distributions. + + ```python + p = [[.1, .2, .7], [.3, .3, .4]] # Shape [2, 3] + dist = Multinomial(n=[4., 5], p=p) + + counts = [[2., 1, 1], [3, 1, 1]] + dist.prob(counts) # Shape [2] + ``` + """ + + def __init__(self, + n, + logits=None, + p=None, + validate_args=True, + allow_nan_stats=False, + name="Multinomial"): + """Initialize a batch of Multinomial distributions. + + Args: + n: Non-negative floating point tensor with shape broadcastable to + `[N1,..., Nm]` with `m >= 0`. Defines this as a batch of + `N1 x ... x Nm` different Multinomial distributions. Its components + should be equal to integer values. + logits: Floating point tensor representing the log-odds of a + positive event with shape broadcastable to `[N1,..., Nm, k], m >= 0`, + and the same dtype as `n`. Defines this as a batch of `N1 x ... x Nm` + different `k` class Multinomial distributions. + p: Positive floating point tensor with shape broadcastable to + `[N1,..., Nm, k]` `m >= 0` and same dtype as `n`. Defines this as + a batch of `N1 x ... x Nm` different `k` class Multinomial + distributions. `p`'s components in the last portion of its shape should + sum up to 1. 
+ validate_args: Whether to assert valid values for parameters `n` and `p`, + and `x` in `prob` and `log_prob`. If `False`, correct behavior is not + guaranteed. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. + name: The name to prefix Ops created by this distribution class. + + Examples: + + ```python + # Define 1-batch of 2-class multinomial distribution, + # also known as a Binomial distribution. + dist = Multinomial(n=2., p=[.1, .9]) + + # Define a 2-batch of 3-class distributions. + dist = Multinomial(n=[4., 5], p=[[.1, .3, .6], [.4, .05, .55]]) + ``` + + """ + + self._logits, self._p = distribution_util.get_logits_and_prob( + name=name, logits=logits, p=p, validate_args=validate_args, + multidimensional=True) + with ops.op_scope([n, self._p], name): + with ops.control_dependencies([ + check_ops.assert_non_negative( + n, message="n has negative components."), + distribution_util.assert_integer_form( + n, message="n has non-integer components." + )] if validate_args else []): + self._n = array_ops.identity(n, name="convert_n") + self._name = name + + self._validate_args = validate_args + self._allow_nan_stats = allow_nan_stats + + self._mean = array_ops.expand_dims(n, -1) * self._p + # Only used for inferring shape. 
+ self._broadcast_shape = math_ops.reduce_sum(self._mean, + reduction_indices=[-1], + keep_dims=False) + + self._get_batch_shape = self._broadcast_shape.get_shape() + self._get_event_shape = ( + self._mean.get_shape().with_rank_at_least(1)[-1:]) + + @property + def n(self): + """Number of trials.""" + return self._n + + @property + def p(self): + """Event probabilities.""" + return self._p + + @property + def logits(self): + """Log-odds.""" + return self._logits + + @property + def name(self): + """Name to prepend to all ops.""" + return self._name + + @property + def dtype(self): + """dtype of samples from this distribution.""" + return self._p.dtype + + @property + def validate_args(self): + """Boolean describing behavior on invalid input.""" + return self._validate_args + + @property + def allow_nan_stats(self): + """Boolean describing behavior when a stat is undefined for batch member.""" + return self._allow_nan_stats + + def batch_shape(self, name="batch_shape"): + """Batch dimensions of this instance as a 1-D int32 `Tensor`. + + The product of the dimensions of the `batch_shape` is the number of + independent distributions of this kind the instance represents. + + Args: + name: name to give to the op + + Returns: + `Tensor` `batch_shape` + """ + with ops.name_scope(self.name): + with ops.op_scope([self._broadcast_shape], name): + return array_ops.shape(self._broadcast_shape) + + def get_batch_shape(self): + """`TensorShape` available at graph construction time. + + Same meaning as `batch_shape`. May be only partially defined. + + Returns: + batch shape + """ + return self._get_batch_shape + + def event_shape(self, name="event_shape"): + """Shape of a sample from a single distribution as a 1-D int32 `Tensor`. 
+ + Args: + name: name to give to the op + + Returns: + `Tensor` `event_shape` + """ + with ops.name_scope(self.name): + with ops.op_scope([self._mean], name): + return array_ops.gather(array_ops.shape(self._mean), + [array_ops.rank(self._mean) - 1]) + + def get_event_shape(self): + """`TensorShape` available at graph construction time. + + Same meaning as `event_shape`. May be only partially defined. + + Returns: + event shape + """ + return self._get_event_shape + + def mean(self, name="mean"): + """Mean of the distribution.""" + with ops.name_scope(self.name): + return array_ops.identity(self._mean, name=name) + + def variance(self, name="variance"): + """Variance of the distribution.""" + with ops.name_scope(self.name): + with ops.op_scope([self._n, self._p, self._mean], name): + p = array_ops.expand_dims( + self._p * array_ops.expand_dims( + array_ops.ones_like(self._n), -1), -1) + variance = -math_ops.batch_matmul( + array_ops.expand_dims(self._mean, -1), p, adj_y=True) + variance += array_ops.batch_matrix_diag(self._mean) + return variance + + def log_prob(self, counts, name="log_prob"): + """`Log(P[counts])`, computed for every batch member. + + For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability + that after sampling `n` draws from this Multinomial distribution, the + number of draws falling in class `j` is `n_j`. Note that different + sequences of draws can result in the same counts, thus the probability + includes a combinatorial coefficient. + + Args: + counts: Non-negative tensor with dtype `dtype` and whose shape can + be broadcast with `self.p` and `self.n`. For fixed leading dimensions, + the last dimension represents counts for the corresponding Multinomial + distribution in `self.p`. `counts` is only legal if it sums up to `n` + and its components are equal to integer values. + name: Name to give this Op, defaults to "log_prob". + + Returns: + Log probabilities for each record, shape `[N1,...,Nm]`. 
+ """ + n = self._n + p = self._p + with ops.name_scope(self.name): + with ops.op_scope([n, p, counts], name): + counts = self._check_counts(counts) + + prob_prob = math_ops.reduce_sum(counts * math_ops.log(self._p), + reduction_indices=[-1]) + log_prob = prob_prob + distribution_util.log_combinations( + n, counts) + return log_prob + + def prob(self, counts, name="prob"): + """`P[counts]`, computed for every batch member. + + For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability + that after sampling `n` draws from this Multinomial distribution, the + number of draws falling in class `j` is `n_j`. Note that different + sequences of draws can result in the same counts, thus the probability + includes a combinatorial coefficient. + + Args: + counts: Non-negative tensor with dtype `dtype` and whose shape can + be broadcast with `self.p` and `self.n`. For fixed leading dimensions, + the last dimension represents counts for the corresponding Multinomial + distribution in `self.p`. `counts` is only legal if it sums up to `n` + and its components are equal to integer values. + name: Name to give this Op, defaults to "prob". + + Returns: + Probabilities for each record, shape `[N1,...,Nm]`. 
+ """ + return super(Multinomial, self).prob(counts, name=name) + + @property + def is_continuous(self): + return False + + @property + def is_reparameterized(self): + return False + + def _check_counts(self, counts): + """Check counts for proper shape, values, then return tensor version.""" + counts = ops.convert_to_tensor(counts, name="counts_before_deps") + candidate_n = math_ops.reduce_sum(counts, reduction_indices=[-1]) + if not self.validate_args: + return counts + + return control_flow_ops.with_dependencies([ + check_ops.assert_non_negative( + counts, message="counts has negative components."), + check_ops.assert_equal( + self._n, candidate_n, message="counts do not sum to n."), + distribution_util.assert_integer_form( + counts, message="counts have non-integer components.")], counts) diff --git a/tensorflow/contrib/distributions/python/ops/mvn.py b/tensorflow/contrib/distributions/python/ops/mvn.py index 90e26336d77..8936594dfac 100644 --- a/tensorflow/contrib/distributions/python/ops/mvn.py +++ b/tensorflow/contrib/distributions/python/ops/mvn.py @@ -21,9 +21,11 @@ from __future__ import print_function import math from tensorflow.contrib.distributions.python.ops import distribution +from tensorflow.contrib.distributions.python.ops import kullback_leibler from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky from tensorflow.contrib.distributions.python.ops import operator_pd_diag from tensorflow.contrib.distributions.python.ops import operator_pd_full +from tensorflow.contrib.distributions.python.ops import operator_pd_vdvt_update from tensorflow.contrib.framework.python.framework import tensor_util as contrib_tensor_util from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops @@ -40,6 +42,7 @@ __all__ = [ "MultivariateNormalDiag", "MultivariateNormalCholesky", "MultivariateNormalFull", + "MultivariateNormalDiagPlusVDVT", ] @@ -52,14 +55,13 @@ class 
MultivariateNormalOperatorPD(distribution.Distribution): #### Mathematical details - The PDF of this distribution is: + With `C` the covariance matrix represented by the operator, the PDF of this + distribution is: ``` - f(x) = (2*pi)^(-k/2) |det(sigma)|^(-1/2) exp(-1/2*(x-mu)^*.sigma^{-1}.(x-mu)) + f(x) = (2 pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 (x - mu)^T C^{-1} (x - mu)) ``` - where `.` denotes the inner product on `R^k` and `^*` denotes transpose. - #### Examples A single multi-variate Gaussian distribution is defined by a vector of means @@ -103,16 +105,16 @@ class MultivariateNormalOperatorPD(distribution.Distribution): which determines the covariance. Args: - mu: `float` or `double` tensor with shape `[N1,...,Nb, k]`, `b >= 0`. - cov: `float` or `double` instance of `OperatorPDBase` with same `dtype` - as `mu` and shape `[N1,...,Nb, k, k]`. + mu: Floating point tensor with shape `[N1,...,Nb, k]`, `b >= 0`. + cov: Instance of `OperatorPDBase` with same `dtype` as `mu` and shape + `[N1,...,Nb, k, k]`. validate_args: Whether to validate input with asserts. If `validate_args` is `False`, and the inputs are invalid, correct behavior is not guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + allow_nan_stats: `Boolean`, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to give Ops created by the initializer. Raises: @@ -148,7 +150,7 @@ class MultivariateNormalOperatorPD(distribution.Distribution): else: return mu - # Static checks could not be run, so possibly do dyamic checks. 
+ # Static checks could not be run, so possibly do dynamic checks. if not self.validate_args: return mu else: @@ -170,12 +172,12 @@ class MultivariateNormalOperatorPD(distribution.Distribution): @property def validate_args(self): - """Boolean describing behavior on invalid input.""" + """`Boolean` describing behavior on invalid input.""" return self._validate_args @property def allow_nan_stats(self): - """Boolean describing behavior when a stat is undefined for batch member.""" + """`Boolean` describing behavior when stats are undefined.""" return self._allow_nan_stats @property @@ -417,7 +419,7 @@ class MultivariateNormalDiag(MultivariateNormalOperatorPD): determined by `diag_stdev`: `C_{ii} = diag_stdev[i]**2`. ``` - f(x) = (2*pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 * (x - mu)^T C^{-1} (x - mu)) + f(x) = (2 pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 (x - mu)^T C^{-1} (x - mu)) ``` #### Examples @@ -464,17 +466,17 @@ class MultivariateNormalDiag(MultivariateNormalOperatorPD): The mean of `X_i` is `mu[i]`, and the standard deviation is `diag_stdev[i]`. Args: - mu: Rank `N + 1` `float` or `double` tensor with shape `[N1,...,Nb, k]`, + mu: Rank `N + 1` floating point tensor with shape `[N1,...,Nb, k]`, `b >= 0`. diag_stdev: Rank `N + 1` `Tensor` with same `dtype` and shape as `mu`, - representing the standard deviations. + representing the standard deviations. Must be positive. validate_args: Whether to validate input with asserts. If `validate_args` is `False`, and the inputs are invalid, correct behavior is not guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + allow_nan_stats: `Boolean`, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) 
is undefined for any + batch member If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to give Ops created by the initializer. Raises: @@ -487,6 +489,125 @@ class MultivariateNormalDiag(MultivariateNormalOperatorPD): name=name) +class MultivariateNormalDiagPlusVDVT(MultivariateNormalOperatorPD): + """The multivariate normal distribution on `R^k`. + + Every batch member of this distribution is defined by a mean and a lightweight + covariance matrix `C`. + + #### Mathematical details + + The PDF of this distribution in terms of the mean `mu` and covariance `C` is: + + ``` + f(x) = (2 pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 (x - mu)^T C^{-1} (x - mu)) + ``` + + For every batch member, this distribution represents `k` random variables + `(X_1,...,X_k)`, with mean `E[X_i] = mu[i]`, and covariance matrix + `C_{ij} := E[(X_i - mu[i])(X_j - mu[j])]` + + The user initializes this class by providing the mean `mu`, and a lightweight + definition of `C`: + + ``` + C = SS^T = SS = (M + V D V^T) (M + V D V^T) + M is diagonal (k x k) + V = is shape (k x r), typically r << k + D = is diagonal (r x r), optional (defaults to identity). + ``` + + This allows for `O(kr + r^3)` pdf evaluation and determinant, and `O(kr)` + sampling and storage (per batch member). + + #### Examples + + A single multi-variate Gaussian distribution is defined by a vector of means + of length `k`, and square root of the covariance `S = M + V D V^T`. Extra + leading dimensions, if provided, allow for batches. + + ```python + # Initialize a single 3-variate Gaussian with covariance square root + # S = M + V D V^T, where V D V^T is a matrix-rank 2 update. + mu = [1, 2, 3.] + diag_large = [1.1, 2.2, 3.3] + v = ... # shape 3 x 2 + diag_small = [4., 5.] + dist = tf.contrib.distributions.MultivariateNormalDiagPlusVDVT( + mu, diag_large, v, diag_small=diag_small) + + # Evaluate this on an observation in R^3, returning a scalar. 
+ dist.pdf([-1, 0, 1]) + + # Initialize a batch of two 3-variate Gaussians. This time, don't provide + # diag_small. This means S = M + V V^T. + mu = [[1, 2, 3], [11, 22, 33]] # shape 2 x 3 + diag_large = ... # shape 2 x 3 + v = ... # shape 2 x 3 x 1, a matrix-rank 1 update. + dist = tf.contrib.distributions.MultivariateNormalDiagPlusVDVT( + mu, diag_large, v) + + # Evaluate this on a two observations, each in R^3, returning a length two + # tensor. + x = [[-1, 0, 1], [-11, 0, 11]] # Shape 2 x 3. + dist.pdf(x) + ``` + + """ + + def __init__( + self, + mu, + diag_large, + v, + diag_small=None, + validate_args=True, + allow_nan_stats=False, + name="MultivariateNormalDiagPlusVDVT"): + """Multivariate Normal distributions on `R^k`. + + For every batch member, this distribution represents `k` random variables + `(X_1,...,X_k)`, with mean `E[X_i] = mu[i]`, and covariance matrix + `C_{ij} := E[(X_i - mu[i])(X_j - mu[j])]` + + The user initializes this class by providing the mean `mu`, and a + lightweight definition of `C`: + + ``` + C = SS^T = SS = (M + V D V^T) (M + V D V^T) + M is diagonal (k x k) + V = is shape (k x r), typically r << k + D = is diagonal (r x r), optional (defaults to identity). + ``` + + Args: + mu: Rank `n + 1` floating point tensor with shape `[N1,...,Nn, k]`, + `n >= 0`. The means. + diag_large: Optional rank `n + 1` floating point tensor, shape + `[N1,...,Nn, k]` `n >= 0`. Defines the diagonal matrix `M`. + v: Rank `n + 1` floating point tensor, shape `[N1,...,Nn, k, r]` + `n >= 0`. Defines the matrix `V`. + diag_small: Rank `n + 1` floating point tensor, shape + `[N1,...,Nn, k]` `n >= 0`. Defines the diagonal matrix `D`. Default + is `None`, which means `D` will be the identity matrix. + validate_args: Whether to validate input with asserts. If `validate_args` + is `False`, + and the inputs are invalid, correct behavior is not guaranteed. + allow_nan_stats: `Boolean`, default `False`. If `False`, raise an + exception if a statistic (e.g. 
mean/mode/etc...) is undefined for any + batch member If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. + name: The name to give Ops created by the initializer. + """ + m = operator_pd_diag.OperatorPDDiag(diag_large, verify_pd=validate_args) + cov = operator_pd_vdvt_update.OperatorPDSqrtVDVTUpdate( + m, v, diag=diag_small, verify_pd=validate_args, + verify_shapes=validate_args) + super(MultivariateNormalDiagPlusVDVT, self).__init__( + mu, cov, allow_nan_stats=allow_nan_stats, validate_args=validate_args, + name=name) + + class MultivariateNormalCholesky(MultivariateNormalOperatorPD): """The multivariate normal distribution on `R^k`. @@ -496,14 +617,14 @@ class MultivariateNormalCholesky(MultivariateNormalOperatorPD): #### Mathematical details - The PDF of this distribution is: + The Cholesky factor `chol` defines the covariance matrix: `C = chol chol^T`. + + The PDF of this distribution is then: ``` - f(x) = (2*pi)^(-k/2) |det(sigma)|^(-1/2) exp(-1/2*(x-mu)^*.sigma^{-1}.(x-mu)) + f(x) = (2 pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 (x - mu)^T C^{-1} (x - mu)) ``` - where `.` denotes the inner product on `R^k` and `^*` denotes transpose. - #### Examples A single multi-variate Gaussian distribution is defined by a vector of means @@ -546,20 +667,21 @@ class MultivariateNormalCholesky(MultivariateNormalOperatorPD): """Multivariate Normal distributions on `R^k`. User must provide means `mu` and `chol` which holds the (batch) Cholesky - factors `S`, such that the covariance of each batch member is `S S^*`. + factors, such that the covariance of each batch member is `chol chol^T`. Args: - mu: `(N+1)-D` `float` or `double` tensor with shape `[N1,...,Nb, k]`, + mu: `(N+1)-D` floating point tensor with shape `[N1,...,Nb, k]`, `b >= 0`. chol: `(N+2)-D` `Tensor` with same `dtype` as `mu` and shape - `[N1,...,Nb, k, k]`. + `[N1,...,Nb, k, k]`. 
The upper triangular part is ignored (treated as + though it is zero), and the diagonal must be positive. validate_args: Whether to validate input with asserts. If `validate_args` - is `False`, - and the inputs are invalid, correct behavior is not guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + is `False`, and the inputs are invalid, correct behavior is not + guaranteed. + allow_nan_stats: `Boolean`, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to give Ops created by the initializer. Raises: @@ -582,14 +704,12 @@ class MultivariateNormalFull(MultivariateNormalOperatorPD): #### Mathematical details - The PDF of this distribution is: + With `C = sigma`, the PDF of this distribution is: ``` - f(x) = (2*pi)^(-k/2) |det(sigma)|^(-1/2) exp(-1/2*(x-mu)^*.sigma^{-1}.(x-mu)) + f(x) = (2 pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 (x - mu)^T C^{-1} (x - mu)) ``` - where `.` denotes the inner product on `R^k` and `^*` denotes transpose. - #### Examples A single multi-variate Gaussian distribution is defined by a vector of means @@ -630,17 +750,17 @@ class MultivariateNormalFull(MultivariateNormalOperatorPD): User must provide means `mu` and `sigma`, the mean and covariance. Args: - mu: `(N+1)-D` `float` or `double` tensor with shape `[N1,...,Nb, k]`, + mu: `(N+1)-D` floating point tensor with shape `[N1,...,Nb, k]`, `b >= 0`. sigma: `(N+2)-D` `Tensor` with same `dtype` as `mu` and shape - `[N1,...,Nb, k, k]`. + `[N1,...,Nb, k, k]`. Each batch member must be positive definite. validate_args: Whether to validate input with asserts. 
If `validate_args` is `False`, and the inputs are invalid, correct behavior is not guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + allow_nan_stats: `Boolean`, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to give Ops created by the initializer. Raises: @@ -653,3 +773,72 @@ class MultivariateNormalFull(MultivariateNormalOperatorPD): allow_nan_stats=allow_nan_stats, validate_args=validate_args, name=name) + + +def _kl_mvn_mvn_brute_force(mvn_a, mvn_b, name=None): + """Batched KL divergence `KL(mvn_a || mvn_b)` for multivariate normals. + + With `X`, `Y` both multivariate normals in `R^k` with means `mu_x`, `mu_y` and + covariance `C_x`, `C_y` respectively, + + ``` + KL(X || Y) = 0.5 * ( T + Q + - k + L ), + T := trace(C_b^{-1} C_a), + Q := (mu_b - mu_a)^T C_b^{-1} (mu_b - mu_a), + L := Log[Det(C_b)] - Log[Det(C_a)] + ``` + + This `Op` computes the trace by solving `C_b^{-1} C_a`. Although efficient + methods for solving systems with `C_b` may be available, a dense version of + (the square root of) `C_a` is used, so performance is `O(B s k^2)` where `B` + is the batch size, and `s` is the cost of solving `C_b x = y` for vectors `x` + and `y`. + + Args: + mvn_a: Instance of subclass of `MultivariateNormalOperatorPD`. + mvn_b: Instance of subclass of `MultivariateNormalOperatorPD`. + name: (optional) name to use for created ops. Default "kl_mvn_mvn". + + Returns: + Batchwise `KL(mvn_a || mvn_b)`. + """ + # Access the "private" OperatorPD that each mvn is built from. 
+ cov_a = mvn_a._cov # pylint: disable=protected-access + cov_b = mvn_b._cov # pylint: disable=protected-access + mu_a = mvn_a.mu + mu_b = mvn_b.mu + inputs = [mu_a, mu_b] + cov_a.inputs + cov_b.inputs + + with ops.op_scope(inputs, name, "kl_mvn_mvn"): + # If Ca = AA', Cb = BB', then + # tr[inv(Cb) Ca] = tr[inv(B)' inv(B) A A'] + # = tr[inv(B) A A' inv(B)'] + # = tr[(inv(B) A) (inv(B) A)'] + # = sum_{ik} (inv(B) A)_{ik}^2 + # The second equality follows from the cyclic permutation property. + b_inv_a = cov_b.sqrt_solve(cov_a.sqrt_to_dense()) + t = math_ops.reduce_sum( + math_ops.square(b_inv_a), + reduction_indices=[-1, -2]) + q = cov_b.inv_quadratic_form_on_vectors(mu_b - mu_a) + k = math_ops.cast(cov_a.vector_space_dimension(), mvn_a.dtype) + one_half_l = cov_b.sqrt_log_det() - cov_a.sqrt_log_det() + return 0.5 * (t + q - k) + one_half_l + + +# Register KL divergences. +kl_classes = [ + MultivariateNormalFull, + MultivariateNormalCholesky, + MultivariateNormalDiag, + MultivariateNormalDiagPlusVDVT, +] + + +for mvn_aa in kl_classes: + # Register when they are the same here, and do not register when they are the + # same below because that would result in a repeated registration. + kullback_leibler.RegisterKL(mvn_aa, mvn_aa)(_kl_mvn_mvn_brute_force) + for mvn_bb in kl_classes: + if mvn_bb != mvn_aa: + kullback_leibler.RegisterKL(mvn_aa, mvn_bb)(_kl_mvn_mvn_brute_force) diff --git a/tensorflow/contrib/distributions/python/ops/normal.py b/tensorflow/contrib/distributions/python/ops/normal.py index dff8c7fdbbe..182afa31f7f 100644 --- a/tensorflow/contrib/distributions/python/ops/normal.py +++ b/tensorflow/contrib/distributions/python/ops/normal.py @@ -92,15 +92,15 @@ class Normal(distribution.Distribution): broadcasting (e.g. `mu + sigma` is a valid operation). Args: - mu: `float` or `double` tensor, the means of the distribution(s). - sigma: `float` or `double` tensor, the stddevs of the distribution(s). + mu: Floating point tensor, the means of the distribution(s). 
+ sigma: Floating point tensor, the stddevs of the distribution(s). sigma must contain only positive values. validate_args: Whether to assert that `sigma > 0`. If `validate_args` is - False, correct output is not guaranteed when input is invalid. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + `False`, correct output is not guaranteed when input is invalid. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to give Ops created by the initializer. Raises: @@ -321,8 +321,7 @@ class Normal(distribution.Distribution): with ops.op_scope([self._mu, self._sigma, n], name): broadcast_shape = (self._mu + self._sigma).get_shape() n = ops.convert_to_tensor(n) - shape = array_ops.concat( - 0, [array_ops.pack([n]), array_ops.shape(self.mean())]) + shape = array_ops.concat(0, ([n], array_ops.shape(self.mean()))) sampled = random_ops.random_normal( shape=shape, mean=0, stddev=1, dtype=self._mu.dtype, seed=seed) diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_diag.py b/tensorflow/contrib/distributions/python/ops/operator_pd_diag.py index ea5aa3c3866..5e019355f74 100644 --- a/tensorflow/contrib/distributions/python/ops/operator_pd_diag.py +++ b/tensorflow/contrib/distributions/python/ops/operator_pd_diag.py @@ -18,6 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import abc +import six + from tensorflow.contrib.distributions.python.ops import operator_pd from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ 
-26,11 +29,190 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops -class OperatorPDSqrtDiag(operator_pd.OperatorPDBase): +@six.add_metaclass(abc.ABCMeta) +class OperatorPDDiagBase(operator_pd.OperatorPDBase): + """Base class for diagonal operators.""" + + def __init__(self, diag, verify_pd=True, name='OperatorPDDiagBase'): + self._verify_pd = verify_pd + self._name = name + with ops.name_scope(name): + with ops.op_scope([diag], 'init'): + self._diag = self._check_diag(diag) + + def _check_diag(self, diag): + """Verify that `diag` is positive.""" + diag = ops.convert_to_tensor(diag, name='diag') + if not self.verify_pd: + return diag + deps = [check_ops.assert_positive(diag)] + return control_flow_ops.with_dependencies(deps, diag) + + @property + def name(self): + """String name identifying this `Operator`.""" + return self._name + + @property + def verify_pd(self): + """Whether to verify that this `Operator` is positive definite.""" + return self._verify_pd + + @property + def dtype(self): + """Data type of matrix elements of `A`.""" + return self._diag.dtype + + @property + def inputs(self): + """Initialization arguments.""" + return [self._diag] + + def get_shape(self): + """`TensorShape` giving static shape.""" + # If d_shape = [5, 3], we return [5, 3, 3]. 
+ d_shape = self._diag.get_shape() + return d_shape.concatenate(d_shape[-1:]) + + def _shape(self): + d_shape = array_ops.shape(self._diag) + k = array_ops.gather(d_shape, array_ops.size(d_shape) - 1) + return array_ops.concat(0, (d_shape, [k])) + + @abc.abstractmethod + def _batch_log_det(self): + pass + + @abc.abstractmethod + def _inv_quadratic_form_on_vectors(self, x): + pass + + @abc.abstractmethod + def _batch_matmul(self, x, transpose_x=False): + pass + + @abc.abstractmethod + def _batch_sqrt_matmul(self, x, transpose_x=False): + pass + + @abc.abstractmethod + def _batch_solve(self, rhs): + pass + + @abc.abstractmethod + def _batch_sqrt_solve(self, rhs): + pass + + @abc.abstractmethod + def _to_dense(self): + pass + + @abc.abstractmethod + def _sqrt_to_dense(self): + pass + + @abc.abstractmethod + def _add_to_tensor(self, mat): + pass + + +class OperatorPDDiag(OperatorPDDiagBase): """Class representing a (batch) of positive definite matrices `A`. This class provides access to functions of a batch of symmetric positive - definite (PD) matrices `A` in `R^{k x k}` defined by their their square root, + definite (PD) matrices `A` in `R^{k x k}`. + + In this case, `A` is diagonal and is defined by a provided tensor `diag`, + `A_{ii} = diag[i]`. + + Determinants, solves, and storage are `O(k)`. + + In practice, this operator represents a (batch) matrix `A` with shape + `[N1,...,Nn, k, k]` for some `n >= 0`. The first `n` indices designate a + batch member. For every batch member `(i1,...,ib)`, `A[i1,...,ib, : :]` is + a `k x k` matrix. + + For example, + + ```python + distributions = tf.contrib.distributions + diag = [1.0, 2.0] + operator = OperatorPDDiag(diag) + operator.det() # ==> (1 * 2) + + # Compute the quadratic form x^T A^{-1} x for vector x. + x = [1.0, 2.0] + operator.inv_quadratic_form_on_vectors(x) + + # Matrix multiplication by the square root, S w, with A = S S^T. + # Recall A is diagonal, and so then is S, with S_{ij} = sqrt(A_{ij}). 
+ # If w is iid normal, S w has covariance A. + w = [[1.0], + [2.0]] + operator.sqrt_matmul(w) + ``` + + The above three methods, `log_det`, `inv_quadratic_form_on_vectors`, and + `sqrt_matmul` provide "all" that is necessary to use a covariance matrix + in a multi-variate normal distribution. See the class + `MultivariateNormalDiag`. + """ + + def __init__(self, diag, verify_pd=True, name='OperatorPDDiag'): + """Initialize an OperatorPDDiag. + + Args: + diag: Shape `[N1,...,Nn, k]` positive tensor with `n >= 0`, `k >= 1`. + verify_pd: Whether to check `diag` is positive. + name: A name to prepend to all ops created by this class. + """ + super(OperatorPDDiag, self).__init__( + diag, verify_pd=verify_pd, name=name) + + def _batch_log_det(self): + return math_ops.reduce_sum( + math_ops.log(self._diag), reduction_indices=[-1]) + + def _inv_quadratic_form_on_vectors(self, x): + return self._iqfov_via_solve(x) + + def _batch_matmul(self, x, transpose_x=False): + if transpose_x: + x = array_ops.batch_matrix_transpose(x) + diag_mat = array_ops.expand_dims(self._diag, -1) + return diag_mat * x + + def _batch_sqrt_matmul(self, x, transpose_x=False): + if transpose_x: + x = array_ops.batch_matrix_transpose(x) + diag_mat = array_ops.expand_dims(self._diag, -1) + return math_ops.sqrt(diag_mat) * x + + def _batch_solve(self, rhs): + diag_mat = array_ops.expand_dims(self._diag, -1) + return rhs / diag_mat + + def _batch_sqrt_solve(self, rhs): + diag_mat = array_ops.expand_dims(self._diag, -1) + return rhs / math_ops.sqrt(diag_mat) + + def _to_dense(self): + return array_ops.batch_matrix_diag(self._diag) + + def _sqrt_to_dense(self): + return array_ops.batch_matrix_diag(math_ops.sqrt(self._diag)) + + def _add_to_tensor(self, mat): + mat_diag = array_ops.batch_matrix_diag_part(mat) + new_diag = self._diag + mat_diag + return array_ops.batch_matrix_set_diag(mat, new_diag) + + +class OperatorPDSqrtDiag(OperatorPDDiagBase): + """Class representing a (batch) of positive definite 
matrices `A`. + + This class provides access to functions of a batch of symmetric positive + definite (PD) matrices `A` in `R^{k x k}` defined by their square root, `S`, such that `A = SS^T`. In this case, `S` is diagonal and is defined by a provided tensor `diag`, @@ -75,58 +257,17 @@ class OperatorPDSqrtDiag(operator_pd.OperatorPDBase): verify_pd: Whether to check `diag` is positive. name: A name to prepend to all ops created by this class. """ - self._verify_pd = verify_pd - self._name = name - with ops.name_scope(name): - with ops.op_scope([diag], 'init'): - self._diag = self._check_diag(diag) - - def _check_diag(self, diag): - """Verify that `diag` is positive.""" - diag = ops.convert_to_tensor(diag, name='diag') - if not self.verify_pd: - return diag - deps = [check_ops.assert_positive(diag)] - return control_flow_ops.with_dependencies(deps, diag) - - @property - def name(self): - """String name identifying this `Operator`.""" - return self._name - - @property - def verify_pd(self): - """Whether to verify that this `Operator` is positive definite.""" - return self._verify_pd - - @property - def dtype(self): - """Data type of matrix elements of `A`.""" - return self._diag.dtype + super(OperatorPDSqrtDiag, self).__init__( + diag, verify_pd=verify_pd, name=name) def _batch_log_det(self): return 2 * math_ops.reduce_sum( math_ops.log(self._diag), reduction_indices=[-1]) - @property - def inputs(self): - """List of tensors that were provided as initialization inputs.""" - return [self._diag] - def _inv_quadratic_form_on_vectors(self, x): # This Operator is defined in terms of diagonal entries of the sqrt. 
return self._iqfov_via_sqrt_solve(x) - def get_shape(self): - """`TensorShape` giving static shape.""" - d_shape = self._diag.get_shape() - return d_shape.concatenate(d_shape[-1:]) - - def _shape(self): - d_shape = array_ops.shape(self._diag) - k = array_ops.gather(d_shape, array_ops.size(d_shape) - 1) - return array_ops.concat(0, (d_shape, [k])) - def _batch_matmul(self, x, transpose_x=False): if transpose_x: x = array_ops.batch_matrix_transpose(x) diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py b/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py new file mode 100644 index 00000000000..f1b750351c7 --- /dev/null +++ b/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py @@ -0,0 +1,207 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Identity operator in `R^k`.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +from tensorflow.contrib.distributions.python.ops import operator_pd +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops + + +class OperatorPDIdentity(operator_pd.OperatorPDBase): + """Identity operator in `R^k`: `Ax = x`. + + This provides an efficient implementation of the identity as an `OperatorPD`. + Storage, solves, and matmul are all `O(1)`, independent of batch size. + + In order to be a drop-in replacement for other operators, shape and dtype + of arguments (e.g. to `matmul`) are checked statically as though this operator + was an instantiated matrix. + + Dynamic shape checks of arguments are not done since that could impede + performance. + """ + + def __init__(self, shape, dtype, verify_pd=True, name='OperatorPDIdentity'): + """Initialize an `OperatorPDIdentity`. + + Args: + shape: `int32` rank 1 `Tensor` of length at least 2, and with the last + two entries equal (since this is a square matrix). + dtype: Data type of the matrix that this operator represents. + verify_pd: `Boolean`, if `True`, asserts are added to the initialization + args to ensure they define this operator as a square (batch) matrix. + name: Name to prepend to `Ops`. + """ + + # Grab static shape if available now. 
+ with ops.name_scope(name): + with ops.op_scope([shape], 'init'): + self._dtype = dtypes.as_dtype(dtype) + self._verify_pd = verify_pd + self._name = name + + # Store the static shape (if possible) right now before adding the + # asserts, since the asserts prevent .constant_value from working. + shape = ops.convert_to_tensor(shape, name='shape') + self._get_shape = tensor_shape.TensorShape( + tensor_util.constant_value(shape)) + self._shape_arg = self._check_shape(shape) + + def _check_shape(self, shape): + """Check that the init arg `shape` defines a valid operator.""" + shape = ops.convert_to_tensor(shape, name='shape') + if not self._verify_pd: + return shape + + # Further checks are equivalent to verification that this is positive + # definite. Why? Because the further checks simply check that this is a + # square matrix, and combining the fact that this is square (and thus maps + # a vector space R^k onto itself), with the behavior of .matmul(), this must + # be the identity operator. + rank = array_ops.size(shape) + assert_matrix = check_ops.assert_less_equal(2, rank) + with ops.control_dependencies([assert_matrix]): + last_dim = array_ops.gather(shape, rank - 1) + second_to_last_dim = array_ops.gather(shape, rank - 2) + assert_square = check_ops.assert_equal(last_dim, second_to_last_dim) + return control_flow_ops.with_dependencies([assert_matrix, assert_square], + shape) + + def _check_x(self, x): + """Static check that the argument `x` is proper `shape`, `dtype`.""" + # x is a typical argument e.g. to matmul or solve. In both cases, x should + # have the same type/shape since this is a square matrix. These checks are + # ususally not needed since we ususally have some tensor backing this + # distribution, and the calls to tf.matmul do a shape/type check. + # + # Static checks only for efficiency, the identity should be fast. + # + # Why check at all? Because we want this operator to be swappable for a + # real Operator. 
+ if self.dtype != x.dtype: + raise TypeError( + 'Expected argument "x" to have same dtype as this operator (%s). ' + 'Found: %s' % (self.dtype, x.dtype)) + + x_shape = x.get_shape() + self_shape = self.get_shape() + found_msg = ( + 'Found: operator.shape = %s, x.shape = %s' % (self_shape, x_shape)) + if x_shape.ndims is not None and self_shape.ndims is not None: + if x_shape.ndims != self_shape.ndims: + raise ValueError( + 'Expected argument "x" to have same tensor rank as this operator. ' + + found_msg) + if x_shape.is_fully_defined() and self_shape.is_fully_defined(): + if x_shape[-2] != self_shape[-1]: + raise ValueError( + 'Incompatible shapes for matrix-matrix operation. ' + found_msg) + + @property + def name(self): + """String name identifying this `Operator`.""" + return self._name + + @property + def verify_pd(self): + """Whether to verify that this `Operator` is positive definite.""" + return self._verify_pd + + @property + def dtype(self): + """Data type of matrix elements of `A`.""" + return self._dtype + + def _add_to_tensor(self, mat): + # Add to a tensor in O(k) time! + mat_diag = array_ops.batch_matrix_diag_part(mat) + new_diag = constant_op.constant(1, dtype=self.dtype) + mat_diag + return array_ops.batch_matrix_set_diag(mat, new_diag) + + def _inv_quadratic_form_on_vectors(self, x): + self._check_x(x) + return self._iqfov_via_sqrt_solve(x) + + @property + def inputs(self): + """List of tensors that were provided as initialization inputs.""" + return [self._shape] + + def get_shape(self): + """Static `TensorShape` of entire operator. + + If this operator represents the batch matrix `A` with + `A.shape = [N1,...,Nn, k, k]`, then this returns + `TensorShape([N1,...,Nn, k, k])` + + Returns: + `TensorShape`, statically determined, may be undefined. 
+ """ + return self._get_shape + + def _shape(self): + return self._shape_arg + + def _det(self): + det = array_ops.ones(self.batch_shape(), dtype=self.dtype) + det.set_shape(self.get_batch_shape()) + return det + + def _batch_log_det(self): + log_det = array_ops.zeros(self.batch_shape(), dtype=self.dtype) + log_det.set_shape(self.get_batch_shape()) + return log_det + + def _batch_sqrt_log_det(self): + s_log_det = array_ops.zeros(self.batch_shape(), dtype=self.dtype) + s_log_det.set_shape(self.get_batch_shape()) + return s_log_det + + def _batch_matmul(self, x, transpose_x=False): + if transpose_x: + x = array_ops.batch_matrix_transpose(x) + self._check_x(x) + return x + + def _batch_sqrt_matmul(self, x, transpose_x=False): + return self._batch_matmul(x, transpose_x=transpose_x) + + def _batch_solve(self, rhs): + self._check_x(rhs) + return rhs + + def _batch_sqrt_solve(self, rhs): + self._check_x(rhs) + return rhs + + def _to_dense(self): + diag = array_ops.ones(self.vector_shape(), dtype=self.dtype) + dense = array_ops.batch_matrix_diag(diag) + dense.set_shape(self.get_shape()) + return dense + + def _sqrt_to_dense(self): + return self.to_dense() diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py b/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py new file mode 100644 index 00000000000..3c934e721c4 --- /dev/null +++ b/tensorflow/contrib/distributions/python/ops/operator_pd_vdvt_update.py @@ -0,0 +1,475 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Operator defined: `A = SS^T` where `S = M + VDV^T`, for `OperatorPD` `M`.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.distributions.python.ops import operator_pd +from tensorflow.contrib.distributions.python.ops import operator_pd_diag +from tensorflow.contrib.distributions.python.ops import operator_pd_identity +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import linalg_ops +from tensorflow.python.ops import math_ops + + +class OperatorPDSqrtVDVTUpdate(operator_pd.OperatorPDBase): + r"""Operator defined by `A=SS^T`, where `S = M + VDV^T` for `OperatorPD` `M`. + + This provides efficient low-rank updates of arbitrary `OperatorPD`. + + Some math: + + Given positive definite operator representing positive definite (batch) matrix + `M` in `R^{k x k}`, diagonal matrix `D` in `R^{r x r}`, and low rank `V` in + `R^{k x r}` this class represents the batch matrix `A`, defined by its square + root `S` as follows: + + ``` + A = SS^T, where + S := M + VDV^T + ``` + + Defining an operator in terms of its square root means that + `A_{ij} = S_i S_j^T`, where `S_i` is the ith row of `S`. The update + `VDV^T` has `ij` coordinate equal to `sum_k V_{ik} D_{kk} V_{jk}`. + + Computational efficiency: + + Defining `A` via its square root eliminates the need to compute the square + root. + + Performance depends on the operator representing `M`, the batch size `B`, and + the width of the matrix being multiplied, or systems being solved `L`. 
+ + Since `V` is rank `r`, the update adds + + * `O(B L k r)` to matmul, which requires a call to `M.matmul`. + * `O(B L r^3)` to solves, which require a call to `M.solve` as well as the + solution to a batch of rank `r` systems. + * `O(B r^3)` to determinants, which require a call to `M.solve` as well as the + solution to a batch of rank `r` systems. + + The rank `r` solve and determinant are both done through a Cholesky + factorization, thus some computation is shared. + + See + https://en.wikipedia.org/wiki/Woodbury_matrix_identity + https://en.wikipedia.org/wiki/Matrix_determinant_lemma + """ + + # Note that diag must be nonsingular to use Woodbury lemma, and must be + # positive def to use a Cholesky factorization, so we enforce that here. + def __init__(self, + operator, + v, + diag=None, + verify_pd=True, + verify_shapes=True, + name='OperatorPDSqrtVDVTUpdate'): + """Initialize an `OperatorPDSqrtVDVTUpdate`. + + Args: + operator: Subclass of `OperatorPDBase`. Represents the (batch) positive + definite matrix `M` in `R^{k x k}`. + v: `Tensor` defining batch matrix of same `dtype` and `batch_shape` as + `operator`, and last two dimensions of shape `(k, r)`. + diag: Optional `Tensor` defining batch vector of same `dtype` and + `batch_shape` as `operator`, and last dimension of size `r`. If `None`, + the update becomes `VV^T` rather than `VDV^T`. + verify_pd: `Boolean`. If `True`, add asserts that `diag > 0`, which, + along with the positive definiteness of `operator`, is sufficient to + make the resulting operator positive definite. + verify_shapes: `Boolean`. If `True`, check that `operator`, `v`, and + `diag` have compatible shapes. + name: A name to prepend to `Op` names. 
+ """ + + if not isinstance(operator, operator_pd.OperatorPDBase): + raise TypeError('operator was not instance of OperatorPDBase.') + + with ops.name_scope(name): + with ops.op_scope(operator.inputs + [v, diag], 'init'): + self._operator = operator + self._v = ops.convert_to_tensor(v, name='v') + self._verify_pd = verify_pd + self._verify_shapes = verify_shapes + self._name = name + + # This operator will be PD so long as the diag is PSD, but Woodbury + # and determinant lemmas require diag to be PD. So require diag PD + # whenever we ask to "verify_pd". + if diag is not None: + self._diag = ops.convert_to_tensor(diag, name='diag') + self._diag_operator = operator_pd_diag.OperatorPDDiag( + diag, verify_pd=self.verify_pd) + # No need to verify that the inverse of a PD is PD. + self._diag_inv_operator = operator_pd_diag.OperatorPDDiag( + 1 / self._diag, verify_pd=False) + else: + self._diag = None + self._diag_operator = self._get_identity_operator(self._v) + self._diag_inv_operator = self._diag_operator + + self._check_types(operator, self._v, self._diag) + # Always check static. 
+ checked = self._check_shapes_static(operator, self._v, self._diag) + if not checked and self._verify_shapes: + self._v, self._diag = self._check_shapes_dynamic( + operator, self._v, self._diag) + + def _get_identity_operator(self, v): + """Get an `OperatorPDIdentity` to play the role of `D` in `VDV^T`.""" + with ops.op_scope([v], 'get_identity_operator'): + if v.get_shape().is_fully_defined(): + v_shape = v.get_shape().as_list() + v_batch_shape = v_shape[:-2] + r = v_shape[-1] + id_shape = v_batch_shape + [r, r] + else: + v_shape = array_ops.shape(v) + v_rank = array_ops.rank(v) + v_batch_shape = array_ops.slice(v_shape, [0], [v_rank - 2]) + r = array_ops.gather(v_shape, v_rank - 1) # Last dim of v + id_shape = array_ops.concat(0, (v_batch_shape, [r, r])) + return operator_pd_identity.OperatorPDIdentity( + id_shape, v.dtype, verify_pd=self._verify_pd) + + def _check_types(self, operator, v, diag): + def msg(): + string = ( + 'dtypes must match: Found operator.dtype = %s, v.dtype = %s' + % (operator.dtype, v.dtype)) + return string + + if operator.dtype != v.dtype: + raise TypeError(msg()) + if diag is not None: + if diag.dtype != v.dtype: + raise TypeError('%s, diag.dtype = %s' % (msg(), diag.dtype)) + + def _check_shapes_static(self, operator, v, diag): + """True if they are compatible. Raise if not. False if could not check.""" + def msg(): + # Error message when shapes don't match. + string = ' Found: operator.shape = %s, v.shape = %s' % (s_op, s_v) + if diag is not None: + string += ', diag.shape = ' % s_d + return string + + s_op = operator.get_shape() + s_v = v.get_shape() + + # If everything is not fully defined, return False because we couldn't check + if not (s_op.is_fully_defined() and s_v.is_fully_defined()): + return False + if diag is not None: + s_d = diag.get_shape() + if not s_d.is_fully_defined(): + return False + + # Now perform the checks, raising ValueError if they fail. + + # Check tensor rank. 
+ if s_v.ndims != s_op.ndims: + raise ValueError('v should have same rank as operator' + msg()) + if diag is not None: + if s_d.ndims != s_op.ndims - 1: + raise ValueError('diag should have rank 1 less than operator' + msg()) + + # Check batch shape + if s_v[:-2] != s_op[:-2]: + raise ValueError('v and operator should have same batch shape' + msg()) + if diag is not None: + if s_d[:-1] != s_op[:-2]: + raise ValueError( + 'diag and operator should have same batch shape' + msg()) + + # Check event shape + if s_v[-2] != s_op[-1]: + raise ValueError( + 'v and operator should be compatible for matmul' + msg()) + if diag is not None: + if s_d[-1] != s_v[-1]: + raise ValueError('diag and v should have same last dimension' + msg()) + + return True + + def _check_shapes_dynamic(self, operator, v, diag): + """Return (v, diag) with Assert dependencies, which check shape.""" + checks = [] + with ops.op_scope([operator, v, diag], 'check_shapes'): + s_v = array_ops.shape(v) + r_op = operator.rank() + r_v = array_ops.rank(v) + if diag is not None: + s_d = array_ops.shape(diag) + r_d = array_ops.rank(diag) + + # Check tensor rank. 
+ checks.append(check_ops.assert_rank(v, r_op)) + if diag is not None: + checks.append(check_ops.assert_rank(diag, r_op - 1)) + + # Check batch shape + checks.append(check_ops.assert_equal( + operator.batch_shape(), array_ops.slice(s_v, [0], [r_v - 2]))) + if diag is not None: + checks.append(check_ops.assert_equal( + operator.batch_shape(), array_ops.slice(s_d, [0], [r_d - 1]))) + + # Check event shape + checks.append(check_ops.assert_equal( + operator.vector_space_dimension(), array_ops.gather(s_v, r_v - 2))) + if diag is not None: + checks.append(check_ops.assert_equal( + array_ops.gather(s_v, r_v - 1), array_ops.gather(s_d, r_d - 1))) + + v = control_flow_ops.with_dependencies(checks, v) + if diag is not None: + diag = control_flow_ops.with_dependencies(checks, diag) + return v, diag + + @property + def name(self): + """String name identifying this `Operator`.""" + return self._name + + @property + def verify_pd(self): + """Whether to verify that this `Operator` is positive definite.""" + return self._verify_pd + + @property + def dtype(self): + """Data type of matrix elements of `A`.""" + return self._v.dtype + + def _inv_quadratic_form_on_vectors(self, x): + return self._iqfov_via_sqrt_solve(x) + + @property + def inputs(self): + """List of tensors that were provided as initialization inputs.""" + return self._operator.inputs + self._diag_operator.inputs + [self._v] + + def get_shape(self): + """Static `TensorShape` of entire operator. + + If this operator represents the batch matrix `A` with + `A.shape = [N1,...,Nn, k, k]`, then this returns + `TensorShape([N1,...,Nn, k, k])` + + Returns: + `TensorShape`, statically determined, may be undefined. 
+ """ + return self._operator.get_shape() + + def _shape(self): + return self._operator.shape() + + def _det(self): + return math_ops.exp(self.log_det()) + + def _batch_log_det(self): + return 2 * self._batch_sqrt_log_det() + + def _log_det(self): + return 2 * self._sqrt_log_det() + + def _sqrt_log_det(self): + # The matrix determinant lemma states: + # det(M + VDV^T) = det(D^{-1} + V^T M^{-1} V) * det(D) * det(M) + # = det(C) * det(D) * det(M) + # + # Here we compute the Cholesky factor of "C", then pass the result on. + diag_chol_c = array_ops.batch_matrix_diag_part(self._chol_capacitance( + batch_mode=False)) + return self._sqrt_log_det_core(diag_chol_c) + + def _batch_sqrt_log_det(self): + # Here we compute the Cholesky factor of "C", then pass the result on. + diag_chol_c = array_ops.batch_matrix_diag_part(self._chol_capacitance( + batch_mode=True)) + return self._sqrt_log_det_core(diag_chol_c) + + def _chol_capacitance(self, batch_mode): + """Cholesky factorization of the capacitance term.""" + # Cholesky factor for (D^{-1} + V^T M^{-1} V), which is sometimes + # known as the "capacitance" matrix. + + # self._operator will use batch if need be. Automatically. We cannot force + # that here. 
+ # M^{-1} V + minv_v = self._operator.solve(self._v) + # V^T M^{-1} V + if batch_mode: + vt_minv_v = math_ops.batch_matmul(self._v, minv_v, adj_x=True) + else: + vt_minv_v = math_ops.matmul(self._v, minv_v, transpose_a=True) + + # D^{-1} + V^T M^{-1} V + capacitance = self._diag_inv_operator.add_to_tensor(vt_minv_v) + # Cholesky[D^{-1} + V^T M^{-1} V] + if batch_mode: + return linalg_ops.batch_cholesky(capacitance) + else: + return linalg_ops.cholesky(capacitance) + + def _sqrt_log_det_core(self, diag_chol_c): + """Finish computation of Sqrt[Log[Det]].""" + # Complete computation of ._log_det and ._batch_log_det, after the initial + # Cholesky factor has been taken with the appropriate batch/non-batch method + + # det(M + VDV^T) = det(D^{-1} + V^T M^{-1} V) * det(D) * det(M) + # = det(C) * det(D) * det(M) + # Multiply by 2 here because this is the log-det of the Cholesky factor of C + log_det_c = 2 * math_ops.reduce_sum( + math_ops.log(diag_chol_c), + reduction_indices=[-1]) + # Add together to get Log[det(M + VDV^T)], the Log-det of the updated square + # root. + log_det_updated_sqrt = ( + log_det_c + self._diag_operator.log_det() + self._operator.log_det()) + return log_det_updated_sqrt + + def _batch_matmul(self, x, transpose_x=False): + # Since the square root is PD, it is symmetric, and so A = SS^T = SS. + s_x = self._batch_sqrt_matmul(x, transpose_x=transpose_x) + return self._batch_sqrt_matmul(s_x) + + def _matmul(self, x, transpose_x=False): + # Since the square root is PD, it is symmetric, and so A = SS^T = SS. + s_x = self._sqrt_matmul(x, transpose_x=transpose_x) + return self._sqrt_matmul(s_x) + + def _batch_sqrt_matmul(self, x, transpose_x=False): + v = self._v + m = self._operator + d = self._diag_operator + # The operators call the appropriate matmul/batch_matmul automatically. We + # cannot override. + # batch_matmul is defined as: x * y, so adj_x and adj_y are the ways to + # transpose the left and right. 
+ mx = m.matmul(x, transpose_x=transpose_x) + vt_x = math_ops.batch_matmul(v, x, adj_x=True, adj_y=transpose_x) + d_vt_x = d.matmul(vt_x) + v_d_vt_x = math_ops.batch_matmul(v, d_vt_x) + + return mx + v_d_vt_x + + def _sqrt_matmul(self, x, transpose_x=False): + v = self._v + m = self._operator + d = self._diag_operator + # The operators call the appropriate matmul/batch_matmul automatically. We + # cannot override. + # matmul is defined as: a * b, so transpose_a, transpose_b are used to + # transpose the left and right. + mx = m.matmul(x, transpose_x=transpose_x) + vt_x = math_ops.matmul(v, x, transpose_a=True, transpose_b=transpose_x) + d_vt_x = d.matmul(vt_x) + v_d_vt_x = math_ops.matmul(v, d_vt_x) + + return mx + v_d_vt_x + + def _solve(self, rhs): + # This operator represents A = SS^T, but S is symmetric, so A = SS, + # which means A^{-1} = S^{-1}S^{-1} = S^{-2} + # S^{-1} rhs + sqrtinv_rhs = self._sqrt_solve(rhs) + return self._sqrt_solve(sqrtinv_rhs) + + def _batch_solve(self, rhs): + sqrtinv_rhs = self._batch_sqrt_solve(rhs) + return self._batch_sqrt_solve(sqrtinv_rhs) + + def _sqrt_solve(self, rhs): + # Recall the square root of this operator is M + VDV^T. + # The Woodbury formula gives: + # (M + VDV^T)^{-1} + # = M^{-1} - M^{-1} V (D^{-1} + V^T M^{-1} V)^{-1} V^T M^{-1} + # = M^{-1} - M^{-1} V C^{-1} V^T M^{-1} + # where C is the capacitance matrix. + # TODO(jvdillon) Determine if recursively applying rank-1 updates is more + # efficient. May not be possible because a general n x n matrix can be + # represented as n rank-1 updates, and solving with this matrix is always + # done in O(n^3) time. + m = self._operator + v = self._v + cchol = self._chol_capacitance(batch_mode=False) + + # The operators will use batch/singleton mode automatically. We don't + # override. 
+ # M^{-1} rhs + minv_rhs = m.solve(rhs) + # V^T M^{-1} rhs + vt_minv_rhs = math_ops.matmul(v, minv_rhs, transpose_a=True) + # C^{-1} V^T M^{-1} rhs + cinv_vt_minv_rhs = linalg_ops.cholesky_solve(cchol, vt_minv_rhs) + # V C^{-1} V^T M^{-1} rhs + v_cinv_vt_minv_rhs = math_ops.matmul(v, cinv_vt_minv_rhs) + # M^{-1} V C^{-1} V^T M^{-1} rhs + minv_v_cinv_vt_minv_rhs = m.solve(v_cinv_vt_minv_rhs) + + # M^{-1} - M^{-1} V C^{-1} V^T M^{-1} + return minv_rhs - minv_v_cinv_vt_minv_rhs + + def _batch_sqrt_solve(self, rhs): + # Recall the square root of this operator is M + VDV^T. + # The Woodbury formula gives: + # (M + VDV^T)^{-1} + # = M^{-1} - M^{-1} V (D^{-1} + V^T M^{-1} V)^{-1} V^T M^{-1} + # = M^{-1} - M^{-1} V C^{-1} V^T M^{-1} + # where C is the capacitance matrix. + m = self._operator + v = self._v + cchol = self._chol_capacitance(batch_mode=True) + + # The operators will use batch/singleton mode automatically. We don't + # override. + # M^{-1} rhs + minv_rhs = m.solve(rhs) + # V^T M^{-1} rhs + vt_minv_rhs = math_ops.batch_matmul(v, minv_rhs, adj_x=True) + # C^{-1} V^T M^{-1} rhs + cinv_vt_minv_rhs = linalg_ops.batch_cholesky_solve(cchol, vt_minv_rhs) + # V C^{-1} V^T M^{-1} rhs + v_cinv_vt_minv_rhs = math_ops.batch_matmul(v, cinv_vt_minv_rhs) + # M^{-1} V C^{-1} V^T M^{-1} rhs + minv_v_cinv_vt_minv_rhs = m.solve(v_cinv_vt_minv_rhs) + + # M^{-1} - M^{-1} V C^{-1} V^T M^{-1} + return minv_rhs - minv_v_cinv_vt_minv_rhs + + def _to_dense(self): + sqrt = self.sqrt_to_dense() + return math_ops.batch_matmul(sqrt, sqrt, adj_y=True) + + def _sqrt_to_dense(self): + v = self._v + d = self._diag_operator + m = self._operator + + d_vt = d.matmul(v, transpose_x=True) + # Batch op won't be efficient for singletons. Currently we don't break + # to_dense into batch/singleton methods. 
+ v_d_vt = math_ops.batch_matmul(v, d_vt) + m_plus_v_d_vt = m.to_dense() + v_d_vt + return m_plus_v_d_vt diff --git a/tensorflow/contrib/distributions/python/ops/shape.py b/tensorflow/contrib/distributions/python/ops/shape.py new file mode 100644 index 00000000000..2030ce22cd5 --- /dev/null +++ b/tensorflow/contrib/distributions/python/ops/shape.py @@ -0,0 +1,396 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A helper class for inferring Distribution shape.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops + + +class _ShapeUtil(object): + """Class which helps infer/identify subsets of tensor dimensions. + + Terminology: + Recall that a `Tensor` has: + shape: sizes of tensor dimensions, + ndims: size of shape; number of tensor dimensions, + dims: indexes into shape; useful for transpose, reduce. + + Tensors sampled from a `Distribution` can be partitioned by: + sample dims: indexes independent, identically distributed (iid) draws, + batch dims: indexes non-identical draws, + event dims: indexes coordinates of a single draw. + + The sample, batch, and event dimensions constitute the entirety of a + `Tensor` shape. 
The dimensions are always in sample, batch, event order. + + Assumptions: + We assume that batch_ndims and event_ndims are statically known for both + creating this object and for inputs to its functions. + TODO(jvdillon): Relax this assumption and support fully unknown shape. + + We also assume that the `Tensor` rank is static, i.e., `x.get_shape().ndims + is not None`. + + Possible use-cases: + ~ Sample dimensions: + Computing summary statistics, i.e., the average is a reduction over sample + dimensions. + + ~ Batch dimensions: + Log-likelihood under model predicted location: + ```python + mu = ... # vector of predictions, one for each covariate. + neg_log_likelihood = -tf.reduce_mean( + Normal(loc=mu, scale=1).log_pdf(x), + reduce_dims=[0]) + ``` + + Monte Carlo estimation of a marginal probability: + Average over batch dimensions where batch dimensions are associated with + random draws of a prior. + E.g., suppose we want to find the Monte Carlo estimate of the marginal + distribution of a Normal with a random Laplace location: + ``` + P(X=x) = integral P(X=x|y) P(Y=y) dy + ~= 1/n sum_{i=1}^n P(X=x|y_i), y_i ~iid Laplace(0,1) + = tf.reduce_mean(Normal(loc=Laplace(0, 1).sample_n(n=1000), + scale=tf.ones([1000, 1])).pdf(x), + reduce_dims=[0]) + ``` + + The `Laplace` distribution generates a tensor of shape [1000, 1]. When fed + to a `Normal`, this is interpreted as 1000 different locations, i.e., + 1000 non-identical Normals. Therefore a single call to pdf(x) yields 1000 + probabilities, one for every location. The average over this batch yields + the marginal. + + ~ Event dimensions: + Computing the determinant of the Jacobian of a function of a random + variable involves a reduction over event dimensions. + + Examples: + Write S, B, E for sample shape, batch shape, and event shape (resp.). + + ```python + x.get_shape() == S + B + E # For statically known x shape. + + # 100 iid samples from one multivariate Normal with two + # degrees of freedom (DF). 
+ mu = [0., 0] + sigma = [[1., 0], + [0, 1]] + X = MultivariateNormal(loc=mu, scale=sigma).sample_n(n=100) + # S = [100] + # B = [] + # E = [2] + + # 100 iid samples from one Wishart with 2x2 DF. + sigma = [[1., 0], + [0, 1]] + X = Wishart(scale=sigma).sample_n(n=100) + # S = [100] + # B = [] + # E = [2, 2] + + # 100 iid samples (with shape [2, 50]) from two, non-identical bivariate + # Normal distributions. + mu = ... # shape(2, 2) + sigma = ... # shape(2, 2, 2) + X = MultivariateNormal(loc=mu, scale=sigma).sample(shape=[2, 50]) + # S = [2, 50] + # B = [2] + # E = [2] + ``` + + """ + + def __init__(self, batch_ndims=None, event_ndims=None, name='ShapeUtil'): + """Construct ShapeUtil with known sample, batch, and/or event ndims. + + Typically, batch_ndims and event_ndims are fixed throughout the lifetime of + a Distribution. + + Args: + batch_ndims: number of dims (rank) of the batch portion of indexes of a + `Tensor`. A "batch" is a non-identical distribution, i.e, Normal with + different parameters. + event_ndims: number of dims (rank) of the event portion of indexes of a + `Tensor`. An "event" is what is sampled from a distribution, i.e., a + trivariate Normal has an event shape of [3] and a 4 dimensional Wishart + has an event shape of [4, 4]. + name: `String`. The name to give Ops created by this class. + + Raises: + ValueError: if batch_ndims or event_ndims are invalid. + """ + if batch_ndims < 0: + raise ValueError('must specify non-negative batch_ndims(%d)', batch_ndims) + if batch_ndims > 0 and event_ndims < 1: + raise ValueError('must specify positive event_ndims(%d) when ' + 'batch_ndims(%d) is positive', event_ndims, batch_ndims) + # TODO(jvdillon): Support batches of scalars. 
+ self._name = name + self._batch_ndims = batch_ndims + self._event_ndims = event_ndims + + @property + def name(self): + """Name given to ops created by this class.""" + return self._name + + @property + def batch_ndims(self): + """Returns number of dimensions corresponding to non-identical draws.""" + return self._batch_ndims + + @property + def event_ndims(self): + """Returns number of dimensions needed to index a sample's coordinates.""" + return self._event_ndims + + def get_ndims(self, x, name='get_ndims'): + """Get tensor ndims (rank). + + Args: + x: `Tensor`. + name: `String`. The name to give this op. + + Raises: + ValueError: if ndims is not statically known. + + Returns: + `Scalar` number of dimensions associated with a `Tensor`. + """ + if x is None: + raise ValueError('Input was None which does not have known ndims.') + with ops.name_scope(self.name): + with ops.op_scope([x], name): + ndims = ops.convert_to_tensor(x).get_shape().ndims + if ndims is None: + raise ValueError('ShapeUtil assumes static number of ' + 'dimensions(%d)', ndims) + return ndims + + def get_sample_ndims(self, x): + """Returns number of dimensions corresponding to iid draws. + + Args: + x: `Tensor`. + + Raises: + ValueError: if batch_ndims or event_ndims are not statically known. + ValueError: if static sample_ndims does not match inferred + + Returns: + Scalar number of dimensions associated with a sample. + """ + ndims = self.get_ndims(x) + sample_ndims = ndims - self.batch_ndims - self.event_ndims + if sample_ndims < 0: + raise ValueError('expected batch_ndims(%d) + event_ndims(%d) < ndims(%d)', + self.batch_ndims, self.event_ndims, ndims) + return sample_ndims + + def get_dims(self, x, sample=True, batch=True, event=True): + """Returns subset of tensor's dimension indexes (indexes into shape). + + Args: + x: `Tensor`. + sample: `Boolean`. Include sample dimensions or not. + batch: `Boolean`. Include batch dimensions or not. + event: `Boolean`. Include event dimensions or not. 
+ + Raises: + ValueError: if `x.get_shape().ndims` is `None` + + Returns: + List enumerating requested dimensions. + """ + ndims = self.get_ndims(x) + + if sample and batch and event: + return list(range(ndims)) + + sample_start = 0 + batch_start = self.get_sample_ndims(x) + event_start = batch_start + self.batch_ndims + + sample_shape = list(range(sample_start, batch_start)) if sample else [] + batch_shape = list(range(batch_start, event_start)) if batch else [] + event_shape = list(range(event_start, ndims)) if event else [] + + return sample_shape + batch_shape + event_shape + + def get_shape(self, x, sample=True, batch=True, event=True, name='get_shape'): + """Returns subset of tensor's shape (size of dimensions). + + Args: + x: `Tensor`. + sample: `Boolean`. Include sample shape or not. + batch: `Boolean`. Include batch shape or not. + event: `Boolean`. Include event shape or not. + name: `String`. The name to give this op. + + Raises: + ValueError: if `x.get_shape().ndims` is `None` + + Returns: + List describing event shape if known statically, `Tensor` otherwise. 
+ """ + if not sample and not batch and not event: + return [] + with ops.name_scope(self._name): + with ops.op_scope([x], name): + x = ops.convert_to_tensor(x) + shape = (x.get_shape().as_list() + if x.get_shape().is_fully_defined() + else array_ops.shape(x)) + + if sample and batch and event: + return shape + + sample_start = 0 + batch_start = self.get_sample_ndims(x) + event_start = batch_start + self.batch_ndims + + sample_shape = shape[sample_start:batch_start] if sample else [] + batch_shape = shape[batch_start:event_start] if batch else [] + event_shape = shape[event_start:] if event else [] + + if not batch and not event: + return sample_shape + if not sample and not event: + return batch_shape + if not sample and not batch: + return event_shape + + if x.get_shape().is_fully_defined(): + return sample_shape + batch_shape + event_shape + else: + return array_ops.concat(0, [sample_shape, batch_shape, event_shape]) + + def get_sample_dims(self, x): + """Returns dimension indexes corresponding to sample. + + Convenience function; identical to: + + ```python + get_dims(x, sample=True, batch=False, event=False) + ``` + + Args: + x: `Tensor`. + + Raises: + ValueError: if `x.get_shape().ndims` is `None` + + Returns: + List enumerating sample dimensions. + """ + return self.get_dims(x, sample=True, batch=False, event=False) + + def get_batch_dims(self, x): + """Returns dimension indexes corresponding to batch. + + Convenience function; identical to: + + ```python + get_dims(x, sample=False, batch=True, event=False) + ``` + + Args: + x: `Tensor`. + + Raises: + ValueError: if `x.get_shape().ndims` is `None` + + Returns: + List enumerating batch dimensions. + """ + return self.get_dims(x, sample=False, batch=True, event=False) + + def get_event_dims(self, x): + """Returns dimension indexes corresponding to event. + + Convenience function; identical to: + + ```python + get_dims(x, sample=False, batch=False, event=True) + ``` + + Args: + x: `Tensor`. 
+ + Raises: + ValueError: if `x.get_shape().ndims` is `None` + + Returns: + List enumerating event dimensions. + """ + return self.get_dims(x, sample=False, batch=False, event=True) + + def get_sample_shape(self, x): + """Returns shape corresponding to sample. + + Convenience function; identical to: + + ```python + get_shape(x, sample=True, batch=False, event=False) + ``` + + Args: + x: `Tensor`. + + Returns: + List describing sample shape if known statically, `Tensor` otherwise. + """ + return self.get_shape(x, sample=True, batch=False, event=False) + + def get_batch_shape(self, x): + """Returns shape corresponding to batch. + + Convenience function; identical to: + + ```python + get_shape(x, sample=False, batch=True, event=False) + ``` + + Args: + x: `Tensor`. + + Returns: + List describing batch shape if known statically, `Tensor` otherwise. + """ + return self.get_shape(x, sample=False, batch=True, event=False) + + def get_event_shape(self, x): + """Returns shape corresponding to event. + + Convenience function; identical to: + + ```python + get_shape(x, sample=False, batch=False, event=True) + ``` + + Args: + x: `Tensor`. + + Returns: + List describing event shape if known statically, `Tensor` otherwise. + """ + return self.get_shape(x, sample=False, batch=False, event=True) diff --git a/tensorflow/contrib/distributions/python/ops/student_t.py b/tensorflow/contrib/distributions/python/ops/student_t.py index e5fa624ddc4..8e43c95b6db 100644 --- a/tensorflow/contrib/distributions/python/ops/student_t.py +++ b/tensorflow/contrib/distributions/python/ops/student_t.py @@ -82,6 +82,7 @@ class StudentT(distribution.Distribution): # returning a length 2 tensor. dist.pdf(3.0) ``` + """ def __init__(self, @@ -99,19 +100,19 @@ class StudentT(distribution.Distribution): broadcasting (e.g. `df + mu + sigma` is a valid operation). Args: - df: `float` or `double` tensor, the degrees of freedom of the + df: Floating point tensor, the degrees of freedom of the distribution(s). 
`df` must contain only positive values. - mu: `float` or `double` tensor, the means of the distribution(s). - sigma: `float` or `double` tensor, the scaling factor for the + mu: Floating point tensor, the means of the distribution(s). + sigma: Floating point tensor, the scaling factor for the distribution(s). `sigma` must contain only positive values. Note that `sigma` is not the standard deviation of this distribution. validate_args: Whether to assert that `df > 0, sigma > 0`. If - `validate_args` is False and inputs are invalid, correct behavior is not - guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + `validate_args` is `False` and inputs are invalid, correct behavior is + not guaranteed. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to give Ops created by the initializer. 
Raises: @@ -185,9 +186,12 @@ class StudentT(distribution.Distribution): nan = np.nan + self._zeros() return math_ops.select(df_gt_1, result_if_defined, nan) else: - one = ops.convert_to_tensor(1.0, dtype=self.dtype) + one = constant_op.constant(1.0, dtype=self.dtype) return control_flow_ops.with_dependencies( - [check_ops.assert_less(one, self._df)], result_if_defined) + [check_ops.assert_less( + one, self._df, + message="mean not defined for components of df <= 1" + )], result_if_defined) def mode(self, name="mode"): with ops.name_scope(self.name): @@ -232,9 +236,12 @@ class StudentT(distribution.Distribution): result_where_defined, self._zeros() + np.nan) else: - one = ops.convert_to_tensor(1.0, self.dtype) + one = constant_op.constant(1.0, dtype=self.dtype) return control_flow_ops.with_dependencies( - [check_ops.assert_less(one, self._df)], result_where_defined) + [check_ops.assert_less( + one, self._df, + message="variance not defined for components of df <= 1" + )], result_where_defined) def std(self, name="std"): with ops.name_scope(self.name): @@ -348,8 +355,7 @@ class StudentT(distribution.Distribution): # Let X = R*cos(theta), and let Y = R*sin(theta). # Then X ~ t_df and Y ~ t_df. # The variates X and Y are not independent. 
- shape = array_ops.concat(0, [array_ops.pack([2, n]), - self.batch_shape()]) + shape = array_ops.concat(0, ([2, n], self.batch_shape())) uniform = random_ops.random_uniform(shape=shape, dtype=self.dtype, seed=seed) diff --git a/tensorflow/contrib/distributions/python/ops/transformed_distribution.py b/tensorflow/contrib/distributions/python/ops/transformed_distribution.py index 185741b2176..82971301560 100644 --- a/tensorflow/contrib/distributions/python/ops/transformed_distribution.py +++ b/tensorflow/contrib/distributions/python/ops/transformed_distribution.py @@ -57,6 +57,7 @@ class TransformedDistribution(distribution.Distribution): name="LogitNormalTransformedDistribution" ) ``` + """ def __init__(self, diff --git a/tensorflow/contrib/distributions/python/ops/uniform.py b/tensorflow/contrib/distributions/python/ops/uniform.py index eb196a3ea91..09437d36d16 100644 --- a/tensorflow/contrib/distributions/python/ops/uniform.py +++ b/tensorflow/contrib/distributions/python/ops/uniform.py @@ -67,14 +67,14 @@ class Uniform(distribution.Distribution): ``` Args: - a: `float` or `double` tensor, the minimum endpoint. - b: `float` or `double` tensor, the maximum endpoint. Must be > `a`. - validate_args: Whether to assert that `a > b`. If `validate_args` is False - and inputs are invalid, correct behavior is not guaranteed. - allow_nan_stats: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + a: Floating point tensor, the minimum endpoint. + b: Floating point tensor, the maximum endpoint. Must be > `a`. + validate_args: Whether to assert that `a > b`. If `validate_args` is + `False` and inputs are invalid, correct behavior is not guaranteed. + allow_nan_stats: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) 
is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. name: The name to prefix Ops created by this distribution class. Raises: @@ -83,8 +83,9 @@ class Uniform(distribution.Distribution): self._allow_nan_stats = allow_nan_stats self._validate_args = validate_args with ops.op_scope([a, b], name): - with ops.control_dependencies([check_ops.assert_less(a, b)] if - validate_args else []): + with ops.control_dependencies([check_ops.assert_less( + a, b, message="uniform not defined when a > b.")] if validate_args + else []): a = array_ops.identity(a, name="a") b = array_ops.identity(b, name="b") @@ -228,7 +229,7 @@ class Uniform(distribution.Distribution): n = ops.convert_to_tensor(n, name="n") n_val = tensor_util.constant_value(n) - shape = array_ops.concat(0, [array_ops.pack([n]), self.batch_shape()]) + shape = array_ops.concat(0, ([n], self.batch_shape())) samples = random_ops.random_uniform(shape=shape, dtype=self.dtype, seed=seed) diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD index 2e7b547b308..14c7258c4a4 100644 --- a/tensorflow/contrib/factorization/BUILD +++ b/tensorflow/contrib/factorization/BUILD @@ -94,6 +94,30 @@ tf_py_test( ], ) +tf_py_test( + name = "gmm_test", + srcs = [ + "python/ops/gmm_test.py", + ], + additional_deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], +) + +tf_py_test( + name = "gmm_ops_test", + srcs = [ + "python/ops/gmm_ops_test.py", + ], + additional_deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], +) + tf_py_test( name = "factorization_ops_test", srcs = ["python/ops/factorization_ops_test.py"], diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py 
b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py index 5a6bbec4b0d..655fb57a3ec 100644 --- a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py +++ b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py @@ -304,7 +304,7 @@ class WalsModelTest(tf.test.TestCase): col_factors2 = [x.eval() for x in wals_model.col_factors] for c1, c2 in zip(col_factors1, col_factors2): - self.assertAllClose(c1, c2, atol=1e-3) + self.assertAllClose(c1, c2, rtol=5e-3, atol=1e-2) def test_als_transposed(self): with self.test_session(): @@ -383,7 +383,7 @@ class WalsModelTest(tf.test.TestCase): regularization=1e-5, row_weights=None, col_weights=None) - self.simple_train(model, inp, 15) + self.simple_train(model, inp, 25) row_factor = model.row_factors[0].eval() col_factor = model.col_factors[0].eval() self.assertAllClose(data, @@ -407,7 +407,7 @@ class WalsModelTest(tf.test.TestCase): regularization=1e-5, row_weights=[0] * rows, col_weights=[0] * cols) - self.simple_train(model, inp, 15) + self.simple_train(model, inp, 25) row_factor = model.row_factors[0].eval() col_factor = model.col_factors[0].eval() self.assertAllClose(data, @@ -438,7 +438,7 @@ class WalsModelTest(tf.test.TestCase): regularization=0.001, row_weights=row_wts, col_weights=col_wts) - self.simple_train(model, inp, 10) + self.simple_train(model, inp, 25) row_factor = model.row_factors[0].eval() col_factor = model.col_factors[0].eval() out = np.dot(row_factor, np.transpose(col_factor)) @@ -446,7 +446,7 @@ class WalsModelTest(tf.test.TestCase): for j in xrange(cols): if keep_index([i, j]): self.assertNear(data[i][j], out[i][j], - err=0.2, msg="%d, %d" % (i, j)) + err=0.4, msg="%d, %d" % (i, j)) else: self.assertNear(0, out[i][j], err=0.5, msg="%d, %d" % (i, j)) diff --git a/tensorflow/contrib/factorization/python/ops/gmm.py b/tensorflow/contrib/factorization/python/ops/gmm.py new file mode 100644 index 00000000000..c23a8cb30ed --- /dev/null +++ 
b/tensorflow/contrib/factorization/python/ops/gmm.py @@ -0,0 +1,211 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Implementation of Gaussian mixture model (GMM) clustering. + +This goes on top of skflow API. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import tensorflow as tf + +from tensorflow.contrib.factorization.python.ops import gmm_ops +from tensorflow.contrib.learn.python.learn.estimators import estimator +from tensorflow.contrib.learn.python.learn.estimators._sklearn import TransformerMixin +from tensorflow.contrib.learn.python.learn.learn_io import data_feeder +from tensorflow.contrib.learn.python.learn.utils import checkpoints +from tensorflow.python.ops.control_flow_ops import with_dependencies + + +class GMM(estimator.Estimator, TransformerMixin): + """GMM clustering.""" + SCORES = 'scores' + ASSIGNMENTS = 'assignments' + ALL_SCORES = 'all_scores' + + def __init__(self, + num_clusters, + model_dir=None, + random_seed=0, + params='wmc', + initial_clusters='random', + covariance_type='full', + batch_size=128, + steps=10, + continue_training=False, + config=None, + verbose=1): + """Creates a model for running GMM training and inference. + + Args: + num_clusters: number of clusters to train. 
+ model_dir: the directory to save the model results and log files. + random_seed: Python integer. Seed for PRNG used to initialize centers. + params: Controls which parameters are updated in the training process. + Can contain any combination of "w" for weights, "m" for means, + and "c" for covars. + initial_clusters: specifies how to initialize the clusters for training. + See gmm_ops.gmm for the possible values. + covariance_type: one of "full", "diag". + batch_size: See TensorFlowEstimator + steps: See TensorFlowEstimator + continue_training: See TensorFlowEstimator + config: See TensorFlowEstimator + verbose: See TensorFlowEstimator + """ + super(GMM, self).__init__( + model_dir=model_dir, + config=config) + self.batch_size = batch_size + self.steps = steps + self.continue_training = continue_training + self.verbose = verbose + self._num_clusters = num_clusters + self._params = params + self._training_initial_clusters = initial_clusters + self._covariance_type = covariance_type + self._training_graph = None + self._random_seed = random_seed + + def fit(self, x, y=None, monitors=None, logdir=None, steps=None): + """Trains a GMM clustering on x. + + Note: See TensorFlowEstimator for logic for continuous training and graph + construction across multiple calls to fit. + + Args: + x: training input matrix of shape [n_samples, n_features]. + y: labels. Should be None. + monitors: List of `Monitor` objects to print training progress and + invoke early stopping. + logdir: the directory to save the log file that can be used for optional + visualization. + steps: number of training steps. If not None, overrides the value passed + in constructor. + + Returns: + Returns self. 
+ """ + if logdir is not None: + self._model_dir = logdir + self._data_feeder = data_feeder.setup_train_data_feeder( + x, None, self._num_clusters, self.batch_size) + self._train_model(input_fn=self._data_feeder.input_builder, + feed_fn=self._data_feeder.get_feed_dict_fn(), + steps=steps or self.steps, + monitors=monitors, + init_feed_fn=self._data_feeder.get_feed_dict_fn()) + return self + + def predict(self, x, batch_size=None): + """Predict cluster id for each element in x. + + Args: + x: 2-D matrix or iterator. + batch_size: size to use for batching up x for querying the model. + + Returns: + Array with same number of rows as x, containing cluster ids. + """ + return super(GMM, self).predict(x=x, batch_size=batch_size)[GMM.ASSIGNMENTS] + + def score(self, x, batch_size=None): + """Predict total sum of distances to nearest clusters. + + Args: + x: 2-D matrix or iterator. + batch_size: size to use for batching up x for querying the model. + + Returns: + Total score. + """ + return np.sum(self.evaluate(x=x, batch_size=batch_size)[GMM.SCORES]) + + def transform(self, x, batch_size=None): + """Transforms each element in x to distances to cluster centers. + + Args: + x: 2-D matrix or iterator. + batch_size: size to use for batching up x for querying the model. + + Returns: + Array with same number of rows as x, and num_clusters columns, containing + distances to the cluster centers. 
+ """ + return super(GMM, self).predict(x=x, batch_size=batch_size)[GMM.ALL_SCORES] + + def clusters(self): + """Returns cluster centers.""" + clusters = checkpoints.load_variable(self.model_dir, + gmm_ops.GmmAlgorithm.CLUSTERS_VARIABLE) + return np.squeeze(clusters, 1) + + def covariances(self): + """Returns the covariances.""" + return checkpoints.load_variable( + self.model_dir, + gmm_ops.GmmAlgorithm.CLUSTERS_COVS_VARIABLE) + + def _get_train_ops(self, features, _): + (_, + _, + losses, + training_op) = gmm_ops.gmm( + features, + self._training_initial_clusters, + self._num_clusters, + self._random_seed, + self._covariance_type, + self._params) + incr_step = tf.assign_add(tf.contrib.framework.get_global_step(), 1) + loss = tf.reduce_sum(losses) + training_op = with_dependencies([training_op, incr_step], loss) + return training_op, loss + + def _get_predict_ops(self, features): + (all_scores, + model_predictions, + _, + _) = gmm_ops.gmm( + features, + self._training_initial_clusters, + self._num_clusters, + self._random_seed, + self._covariance_type, + self._params) + return { + GMM.ALL_SCORES: all_scores[0], + GMM.ASSIGNMENTS: model_predictions[0] + } + + def _get_eval_ops(self, features, _, unused_metrics): + (_, + _, + losses, + _) = gmm_ops.gmm( + features, + self._training_initial_clusters, + self._num_clusters, + self._random_seed, + self._covariance_type, + self._params) + return { + GMM.SCORES: tf.reduce_sum(losses), + } diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py new file mode 100644 index 00000000000..e9a64efe2a5 --- /dev/null +++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py @@ -0,0 +1,461 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Gaussian mixture models Operations.""" +# TODO(xavigonzalvo): Factor out covariance matrix operations to make +# code reusable for different types (e.g. diag). + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + +from tensorflow.python.ops.embedding_ops import embedding_lookup + +# Machine epsilon. +MEPS = np.finfo(float).eps +FULL_COVARIANCE = 'full' +DIAG_COVARIANCE = 'diag' + + +def _covariance(x, diag): + """Defines the covariance operation of a matrix. + + Args: + x: a matrix Tensor. Dimension 0 should contain the number of examples. + diag: if True, it computes the diagonal covariance. + + Returns: + A Tensor representing the covariance of x. In the case of + diagonal matrix just the diagonal is returned. + """ + num_points = tf.to_float(tf.shape(x)[0]) + x -= tf.reduce_mean(x, 0, keep_dims=True) + if diag: + cov = tf.reduce_sum( + tf.square(x), 0, keep_dims=True) / (num_points - 1) + else: + cov = tf.matmul(x, x, transpose_a=True) / (num_points - 1) + return cov + + +def _init_clusters_random(data, num_clusters, random_seed): + """Does random initialization of clusters. + + Args: + data: a list of Tensors with a matrix of data, each row is an example. + num_clusters: an integer with the number of clusters. + random_seed: Seed for PRNG used to initialize seeds. 
+ + Returns: + A Tensor with num_clusters random rows of data. + """ + assert isinstance(data, list) + num_data = tf.add_n([tf.shape(inp)[0] for inp in data]) + with tf.control_dependencies([tf.assert_less_equal(num_clusters, num_data)]): + indices = tf.random_uniform([num_clusters], + minval=0, + maxval=tf.cast(num_data, tf.int64), + seed=random_seed, + dtype=tf.int64) + indices = tf.cast(indices, tf.int32) % num_data + clusters_init = embedding_lookup(data, indices, partition_strategy='div') + return clusters_init + + +class GmmAlgorithm(object): + """Tensorflow Gaussian mixture model clustering class.""" + CLUSTERS_VARIABLE = 'clusters' + CLUSTERS_COVS_VARIABLE = 'clusters_covs' + + def __init__(self, data, num_classes, initial_means=None, params='wmc', + covariance_type=FULL_COVARIANCE, random_seed=0): + """Constructor. + + Args: + data: a list of Tensors with data, each row is a new example. + num_classes: number of clusters. + initial_means: a Tensor with a matrix of means. If None, means are + computed by sampling randomly. + params: Controls which parameters are updated in the training + process. Can contain any combination of "w" for weights, "m" for + means, and "c" for covariances. + covariance_type: one of "full", "diag". + random_seed: Seed for PRNG used to initialize seeds. + + Raises: + Exception if covariance type is unknown. + """ + self._params = params + self._random_seed = random_seed + self._covariance_type = covariance_type + if self._covariance_type not in [DIAG_COVARIANCE, FULL_COVARIANCE]: + raise Exception( # pylint: disable=g-doc-exception + 'programmer error: Invalid covariance type: %s' % + self._covariance_type) + # Create sharded variables for multiple shards. The following + # lists are indexed by shard. + # Probability per example in a class. + num_shards = len(data) + self._probs = [None] * num_shards + # Prior probability. 
+ self._prior_probs = [None] * num_shards + # Membership weights w_{ik} where "i" is the i-th example and "k" + # is the k-th mixture. + self._w = [None] * num_shards + # Number of examples in a class. + self._points_in_k = [None] * num_shards + first_shard = data[0] + self._dimensions = tf.shape(first_shard)[1] + self._num_classes = num_classes + # Small value to guarantee that covariances are invertible. + self._min_var = tf.diag(tf.ones(tf.pack([self._dimensions]))) * 1e-3 + self._create_variables(data, initial_means) + # Operations of partial statistics for the computation of the means. + self._w_mul_x = [] + # Operations of partial statistics for the computation of the covariances. + self._w_mul_x2 = [] + self._define_graph(data) + + def _create_variables(self, data, initial_means=None): + """Initializes GMM algorithm. + + Args: + data: a list of Tensors with data, each row is a new example. + initial_means: a Tensor with a matrix of means. + """ + first_shard = data[0] + # Initialize means: num_classes X 1 X dimensions. + if initial_means is not None: + self._means = tf.Variable(tf.expand_dims(initial_means, 1), + name=self.CLUSTERS_VARIABLE, + validate_shape=False, dtype=tf.float32) + else: + # Sample data randomly + self._means = tf.Variable(tf.expand_dims( + _init_clusters_random(data, self._num_classes, self._random_seed), 1), + name=self.CLUSTERS_VARIABLE, + validate_shape=False) + + # Initialize covariances. + if self._covariance_type == FULL_COVARIANCE: + cov = _covariance(first_shard, False) + self._min_var + # A matrix per class, num_classes X dimensions X dimensions + covs = tf.tile( + tf.expand_dims(cov, 0), [self._num_classes, 1, 1]) + elif self._covariance_type == DIAG_COVARIANCE: + cov = _covariance(first_shard, True) + self._min_var + # A diagonal per row, num_classes X dimensions. 
+ covs = tf.tile(tf.expand_dims(tf.diag_part(cov), 0), + [self._num_classes, 1]) + self._covs = tf.Variable(covs, name='clusters_covs', validate_shape=False) + # Mixture weights, representing the probability that a randomly + # selected unobservable data (in EM terms) was generated by component k. + self._alpha = tf.Variable(tf.tile([1.0 / self._num_classes], + [self._num_classes])) + + def training_ops(self): + """Returns the training operation.""" + return self._train_ops + + def alphas(self): + return self._alpha + + def clusters(self): + """Returns the clusters with dimensions num_classes X 1 X num_dimensions.""" + return self._means + + def covariances(self): + """Returns the covariances matrices.""" + return self._covs + + def assignments(self): + """Returns a list of Tensors with the matrix of assignments per shard.""" + ret = [] + for w in self._w: + ret.append(tf.argmax(w, 1)) + return ret + + def scores(self): + """Returns the distances to each class. + + Returns: + A tuple with two Tensors. The first contains the distance to + each class. The second contains the distance to the assigned + class. + """ + return (self._all_scores, self._scores) + + def _define_graph(self, data): + """Define graph for a single iteration. + + Args: + data: a list of Tensors defining the training data. + """ + for shard_id, shard in enumerate(data): + self._num_examples = tf.shape(shard)[0] + shard = tf.expand_dims(shard, 0) + self._define_log_prob_operation(shard_id, shard) + self._define_prior_log_prob_operation(shard_id) + self._define_expectation_operation(shard_id) + self._define_partial_maximization_operation(shard_id, shard) + self._define_maximization_operation(len(data)) + self._define_distance_to_clusters(data) + + def _define_full_covariance_probs(self, shard_id, shard): + """Defines the full covariance probabilties per example in a class. + + Updates a matrix with dimension num_examples X num_classes. + + Args: + shard_id: id of the current shard. 
+ shard: current data shard, 1 X num_examples X dimensions. + """ + diff = shard - self._means + cholesky = tf.batch_cholesky(self._covs + self._min_var) + log_det_covs = 2.0 * tf.reduce_sum(tf.log( + tf.batch_matrix_diag_part(cholesky)), 1) + x_mu_cov = tf.square(tf.batch_matrix_triangular_solve( + cholesky, tf.transpose(diff, perm=[0, 2, 1]), + lower=True)) + diag_m = tf.transpose(tf.reduce_sum(x_mu_cov, 1)) + self._probs[shard_id] = -0.5 * ( + diag_m + tf.to_float(self._dimensions) * tf.log(2 * np.pi) + + log_det_covs) + + def _define_diag_covariance_probs(self, shard_id, shard): + """Defines the diagonal covariance probabilities per example in a class. + + Args: + shard_id: id of the current shard. + shard: current data shard, 1 X num_examples X dimensions. + + Returns a matrix num_examples * num_classes. + """ + # num_classes X 1 + # TODO(xavigonzalvo): look into alternatives to log for + # reparametrization of variance parameters. + det_expanded = tf.reduce_sum(tf.log(self._covs + 1e-3), + 1, keep_dims=True) + diff = shard - self._means + x2 = tf.square(diff) + cov_expanded = tf.expand_dims(1.0 / (self._covs + 1e-3), 2) + # num_classes X num_examples + x2_cov = tf.batch_matmul(x2, cov_expanded) + x2_cov = tf.transpose(tf.squeeze(x2_cov, [2])) + self._probs[shard_id] = -0.5 * ( + tf.to_float(self._dimensions) * tf.log(2.0 * np.pi) + + tf.transpose(det_expanded) + x2_cov) + + def _define_log_prob_operation(self, shard_id, shard): + """Probability per example in a class. + + Updates a matrix with dimension num_examples X num_classes. + + Args: + shard_id: id of the current shard. + shard: current data shard, 1 X num_examples X dimensions. 
+ """ + # TODO(xavigonzalvo): Use the pdf defined in + # third_party/tensorflow/contrib/distributions/python/ops/gaussian.py + if self._covariance_type == FULL_COVARIANCE: + self._define_full_covariance_probs(shard_id, shard) + elif self._covariance_type == DIAG_COVARIANCE: + self._define_diag_covariance_probs(shard_id, shard) + self._probs[shard_id] += tf.log(self._alpha) + + def _define_prior_log_prob_operation(self, shard_id): + """Computes the prior probability of all samples. + + Updates a vector where each item is the prior probabibility of an + input example. + + Args: + shard_id: id of current shard_id. + """ + self._prior_probs[shard_id] = tf.log( + tf.reduce_sum(tf.exp(self._probs[shard_id]), 1, keep_dims=True)) + + def _define_expectation_operation(self, shard_id): + # Shape broadcasting. + probs = tf.expand_dims(self._probs[shard_id], 0) + # Membership weights are computed as: + # w_{ik} = \frac{\alpha_k f(\mathbf{y_i}|\mathbf{\theta}_k)} + # {\sum_{m=1}^{K}\alpha_mf(\mathbf{y_i}|\mathbf{\theta}_m)} + # where "i" is the i-th example, "k" is the k-th mixture, theta are + # the model parameters and y_i the observations. + # These are defined for each shard. + self._w[shard_id] = tf.reshape( + tf.exp(probs - self._prior_probs[shard_id]), + tf.pack([self._num_examples, self._num_classes])) + + def _define_partial_maximization_operation(self, shard_id, shard): + """Computes the partial statistics of the means and covariances. + + Args: + shard_id: current shard id. + shard: current data shard, 1 X num_examples X dimensions. + """ + # Soft assignment of each data point to each of the two clusters. + self._points_in_k[shard_id] = tf.reduce_sum(self._w[shard_id], 0, + keep_dims=True) + # Partial means. + w_mul_x = tf.expand_dims( + tf.matmul(self._w[shard_id], + tf.squeeze(shard, [0]), transpose_a=True), 1) + self._w_mul_x.append(w_mul_x) + # Partial covariances. 
+ x = tf.concat(0, [shard for _ in range(self._num_classes)]) + x_trans = tf.transpose(x, perm=[0, 2, 1]) + x_mul_w = tf.concat(0, [ + tf.expand_dims(x_trans[k, :, :] * self._w[shard_id][:, k], 0) + for k in range(self._num_classes)]) + self._w_mul_x2.append(tf.batch_matmul(x_mul_w, x)) + + def _define_maximization_operation(self, num_batches): + """Maximization operations.""" + # TODO(xavigonzalvo): some of these operations could be moved to C++. + # Compute the effective number of data points assigned to component k. + with tf.control_dependencies(self._w): + points_in_k = tf.squeeze(tf.add_n(self._points_in_k), squeeze_dims=[0]) + # Update alpha. + if 'w' in self._params: + final_points_in_k = points_in_k / num_batches + num_examples = tf.to_float(tf.reduce_sum(final_points_in_k)) + self._alpha_op = self._alpha.assign( + final_points_in_k / (num_examples + MEPS)) + else: + self._alpha_op = tf.no_op() + self._train_ops = [self._alpha_op] + + # Update means. + points_in_k_expanded = tf.reshape(points_in_k, + [self._num_classes, 1, 1]) + if 'm' in self._params: + self._means_op = self._means.assign( + tf.div(tf.add_n(self._w_mul_x), points_in_k_expanded + MEPS)) + else: + self._means_op = tf.no_op() + # means are (num_classes x 1 x dims) + + # Update covariances. + with tf.control_dependencies([self._means_op]): + b = tf.add_n(self._w_mul_x2) / (points_in_k_expanded + MEPS) + new_covs = [] + for k in range(self._num_classes): + mean = self._means.ref()[k, :, :] + square_mean = tf.matmul(mean, mean, transpose_a=True) + new_cov = b[k, :, :] - square_mean + self._min_var + if self._covariance_type == FULL_COVARIANCE: + new_covs.append(tf.expand_dims(new_cov, 0)) + elif self._covariance_type == DIAG_COVARIANCE: + new_covs.append(tf.expand_dims(tf.diag_part(new_cov), 0)) + new_covs = tf.concat(0, new_covs) + if 'c' in self._params: + # Train operations don't need to take care of the means + # because covariances already depend on it. 
+ with tf.control_dependencies([self._means_op, new_covs]): + self._train_ops.append( + tf.assign(self._covs, new_covs, validate_shape=False)) + + def _define_distance_to_clusters(self, data): + """Defines the Mahalanobis distance to the assigned Gaussian.""" + # TODO(xavigonzalvo): reuse (input - mean) * cov^-1 * (input - + # mean) from log probability function. + self._all_scores = [] + for shard in data: + all_scores = [] + shard = tf.expand_dims(shard, 0) + for c in xrange(self._num_classes): + if self._covariance_type == FULL_COVARIANCE: + cov = self._covs[c, :, :] + elif self._covariance_type == DIAG_COVARIANCE: + cov = tf.diag(self._covs[c, :]) + inverse = tf.matrix_inverse(cov + self._min_var) + inv_cov = tf.tile( + tf.expand_dims(inverse, 0), + tf.pack([self._num_examples, 1, 1])) + diff = tf.transpose(shard - self._means[c, :, :], perm=[1, 0, 2]) + m_left = tf.batch_matmul(diff, inv_cov) + all_scores.append(tf.sqrt(tf.batch_matmul( + m_left, tf.transpose(diff, perm=[0, 2, 1]) + ))) + self._all_scores.append(tf.reshape( + tf.concat(1, all_scores), + tf.pack([self._num_examples, self._num_classes]))) + + # Distance to the associated class. + self._all_scores = tf.concat(0, self._all_scores) + assignments = tf.concat(0, self.assignments()) + rows = tf.to_int64(tf.range(0, self._num_examples)) + indices = tf.concat(1, [tf.expand_dims(rows, 1), + tf.expand_dims(assignments, 1)]) + self._scores = tf.gather_nd(self._all_scores, indices) + + def _define_loglikelihood_operation(self): + """Defines the total log-likelihood of current iteration.""" + self._ll_op = [] + for prior_probs in self._prior_probs: + self._ll_op.append(tf.reduce_sum(tf.log(prior_probs))) + tf.scalar_summary('ll', tf.reduce_sum(self._ll_op)) + + +def gmm(inp, initial_clusters, num_clusters, random_seed, + covariance_type=FULL_COVARIANCE, params='wmc'): + """Creates the graph for Gaussian mixture model (GMM) clustering. 
+ + Args: + inp: An input tensor or list of input tensors + initial_clusters: Specifies the clusters used during + initialization. Can be a tensor or numpy array, or a function + that generates the clusters. Can also be "random" to specify + that clusters should be chosen randomly from input data. Note: type + is diverse to be consistent with skflow. + num_clusters: number of clusters. + random_seed: Python integer. Seed for PRNG used to initialize centers. + covariance_type: one of "diag", "full". + params: Controls which parameters are updated in the training + process. Can contain any combination of "w" for weights, "m" for + means, and "c" for covars. + + Returns: + Note: tuple of lists returned to be consistent with skflow + A tuple consisting of: + all_scores: A matrix (or list of matrices) of dimensions (num_input, + num_clusters) where the value is the distance of an input vector and a + cluster center. + assignments: A vector (or list of vectors). Each element in the vector + corresponds to an input row in 'inp' and specifies the cluster id + corresponding to the input. + scores: Similar to assignments but specifies the distance to the + assigned cluster instead. + training_op: an op that runs an iteration of training. + """ + initial_means = None + if initial_clusters != 'random' and not isinstance( + initial_clusters, tf.Tensor): + initial_means = tf.constant(initial_clusters, dtype=tf.float32) + + # Implementation of GMM. 
+ inp = inp if isinstance(inp, list) else [inp] + gmm_tool = GmmAlgorithm(inp, num_clusters, initial_means, params, + covariance_type, random_seed) + training_ops = gmm_tool.training_ops() + assignments = gmm_tool.assignments() + all_scores, scores = gmm_tool.scores() + return [all_scores], [assignments], [scores], tf.group(*training_ops) diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py b/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py new file mode 100644 index 00000000000..a1bc0dca7ba --- /dev/null +++ b/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py @@ -0,0 +1,198 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for gmm_ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + +from tensorflow.contrib.factorization.python.ops import gmm_ops +from tensorflow.python.platform import tf_logging as logging + + +class GmmOpsTest(tf.test.TestCase): + + def setUp(self): + self.num_examples = 1000 + self.iterations = 40 + self.seed = 4 + tf.set_random_seed(self.seed) + np.random.seed(self.seed * 2) + self.data, self.true_assignments = self.make_data(self.num_examples) + # Generate more complicated data. + self.centers = [[1, 1], [-1, 0.5], [2, 1]] + self.more_data, self.more_true_assignments = self.make_data_from_centers( + self.num_examples, self.centers) + + @staticmethod + def make_data(num_vectors): + """Generates 2-dimensional data centered on (2,2), (-1,-1). + + Args: + num_vectors: number of training examples. + + Returns: + A tuple containing the data as a numpy array and the cluster ids. + """ + vectors = [] + classes = [] + for _ in xrange(num_vectors): + if np.random.random() > 0.5: + vectors.append([np.random.normal(2.0, 0.6), + np.random.normal(2.0, 0.9)]) + classes.append(0) + else: + vectors.append([np.random.normal(-1.0, 0.4), + np.random.normal(-1.0, 0.5)]) + classes.append(1) + return np.asarray(vectors), classes + + @staticmethod + def make_data_from_centers(num_vectors, centers): + """Generates 2-dimensional data with random centers. + + Args: + num_vectors: number of training examples. + centers: a list of random 2-dimensional centers. + + Returns: + A tuple containing the data as a numpy array and the cluster ids. 
+ """ + vectors = [] + classes = [] + for _ in xrange(num_vectors): + current_class = np.random.random_integers(0, len(centers) - 1) + vectors.append([np.random.normal(centers[current_class][0], + np.random.random_sample()), + np.random.normal(centers[current_class][1], + np.random.random_sample())]) + classes.append(current_class) + return np.asarray(vectors), len(centers) + + def test_covariance(self): + start_time = time.time() + data = self.data.T + np_cov = np.cov(data) + logging.info('Numpy took %f', time.time() - start_time) + + start_time = time.time() + with self.test_session() as sess: + op = gmm_ops._covariance( + tf.constant(data.T, dtype=tf.float32), + False) + op_diag = gmm_ops._covariance( + tf.constant(data.T, dtype=tf.float32), + True) + tf.initialize_all_variables().run() + tf_cov = sess.run(op) + np.testing.assert_array_almost_equal(np_cov, tf_cov) + logging.info('Tensorflow took %f', time.time() - start_time) + tf_cov = sess.run(op_diag) + np.testing.assert_array_almost_equal( + np.diag(np_cov), np.ravel(tf_cov), decimal=5) + + def test_simple_cluster(self): + """Tests that the clusters are correct.""" + num_classes = 2 + graph = tf.Graph() + with graph.as_default() as g: + g.seed = 5 + with self.test_session() as sess: + data = tf.constant(self.data, dtype=tf.float32) + _, assignments, _, training_op = gmm_ops.gmm(data, 'random', + num_classes, + random_seed=self.seed) + + tf.initialize_all_variables().run() + for _ in xrange(self.iterations): + sess.run(training_op) + assignments = sess.run(assignments) + accuracy = np.mean( + np.asarray(self.true_assignments) == np.squeeze(assignments)) + logging.info('Accuracy: %f', accuracy) + self.assertGreater(accuracy, 0.98) + + def testParams(self): + """Tests that the params work as intended.""" + num_classes = 2 + with self.test_session() as sess: + # Experiment 1. Update weights only. 
+ data = tf.constant(self.data, dtype=tf.float32) + gmm_tool = gmm_ops.GmmAlgorithm([data], num_classes, + [[3.0, 3.0], [0.0, 0.0]], 'w') + training_ops = gmm_tool.training_ops() + tf.initialize_all_variables().run() + for _ in xrange(self.iterations): + sess.run(training_ops) + + # Only the probability to each class is updated. + alphas = sess.run(gmm_tool.alphas()) + self.assertGreater(alphas[1], 0.6) + means = sess.run(gmm_tool.clusters()) + np.testing.assert_almost_equal( + np.expand_dims([[3.0, 3.0], [0.0, 0.0]], 1), means) + covs = sess.run(gmm_tool.covariances()) + np.testing.assert_almost_equal(covs[0], covs[1]) + + # Experiment 2. Update means and covariances. + gmm_tool = gmm_ops.GmmAlgorithm([data], num_classes, + [[3.0, 3.0], [0.0, 0.0]], 'mc') + training_ops = gmm_tool.training_ops() + tf.initialize_all_variables().run() + for _ in xrange(self.iterations): + sess.run(training_ops) + alphas = sess.run(gmm_tool.alphas()) + self.assertAlmostEqual(alphas[0], alphas[1]) + means = sess.run(gmm_tool.clusters()) + np.testing.assert_almost_equal( + np.expand_dims([[2.0, 2.0], [-1.0, -1.0]], 1), means, decimal=1) + covs = sess.run(gmm_tool.covariances()) + np.testing.assert_almost_equal( + [[0.371111, -0.0050774], [-0.0050774, 0.8651744]], + covs[0], decimal=4) + np.testing.assert_almost_equal( + [[0.146976, 0.0259463], [0.0259463, 0.2543971]], + covs[1], decimal=4) + + # Experiment 3. Update covariances only. 
+ gmm_tool = gmm_ops.GmmAlgorithm([data], num_classes, + [[-1.0, -1.0], [1.0, 1.0]], 'c') + training_ops = gmm_tool.training_ops() + tf.initialize_all_variables().run() + for _ in xrange(self.iterations): + sess.run(training_ops) + alphas = sess.run(gmm_tool.alphas()) + self.assertAlmostEqual(alphas[0], alphas[1]) + means = sess.run(gmm_tool.clusters()) + np.testing.assert_almost_equal( + np.expand_dims([[-1.0, -1.0], [1.0, 1.0]], 1), means) + covs = sess.run(gmm_tool.covariances()) + np.testing.assert_almost_equal( + [[0.1299582, 0.0435872], [0.0435872, 0.2558578]], + covs[0], decimal=5) + np.testing.assert_almost_equal( + [[3.195385, 2.6989155], [2.6989155, 3.3881593]], + covs[1], decimal=5) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/factorization/python/ops/gmm_test.py b/tensorflow/contrib/factorization/python/ops/gmm_test.py new file mode 100644 index 00000000000..323133e0dff --- /dev/null +++ b/tensorflow/contrib/factorization/python/ops/gmm_test.py @@ -0,0 +1,172 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for ops.gmm.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf + +from tensorflow.contrib.factorization.python.ops.gmm import GMM +from tensorflow.contrib.factorization.python.ops.kmeans import KMeansClustering as KMeans +from tensorflow.contrib.learn.python.learn.estimators import run_config + +FLAGS = tf.app.flags.FLAGS + + +class GMMTest(tf.test.TestCase): + + def setUp(self): + np.random.seed(3) + tf.set_random_seed(2) + self.num_centers = 2 + self.num_dims = 2 + self.num_points = 4000 + self.batch_size = 100 + self.true_centers = self.make_random_centers(self.num_centers, + self.num_dims) + self.points, self.assignments, self.scores = self.make_random_points( + self.true_centers, + self.num_points) + self.true_score = np.add.reduce(self.scores) + + # Use initial means from kmeans (just like scikit-learn does). 
+ clusterer = KMeans(num_clusters=self.num_centers) + clusterer.fit(self.points, steps=30) + self.initial_means = clusterer.clusters() + + @staticmethod + def make_random_centers(num_centers, num_dims): + return np.round(np.random.rand(num_centers, + num_dims).astype(np.float32) * 500) + + @staticmethod + def make_random_points(centers, num_points): + num_centers, num_dims = centers.shape + assignments = np.random.choice(num_centers, num_points) + offsets = np.round(np.random.randn(num_points, + num_dims).astype(np.float32) * 20) + points = centers[assignments] + offsets + means = [np.mean(points[assignments == center], axis=0) + for center in xrange(num_centers)] + covs = [np.cov(points[assignments == center].T) + for center in xrange(num_centers)] + scores = [] + for r in xrange(num_points): + scores.append(np.sqrt(np.dot( + np.dot(points[r, :] - means[assignments[r]], + np.linalg.inv(covs[assignments[r]])), + points[r, :] - means[assignments[r]]))) + return (points, assignments, scores) + + def test_clusters(self): + """Tests the shape of the clusters.""" + gmm = GMM(self.num_centers, + initial_clusters=self.initial_means, + batch_size=self.batch_size, + steps=40, + continue_training=True, + random_seed=4, + config=run_config.RunConfig(tf_random_seed=2)) + gmm.fit(x=self.points, steps=0) + clusters = gmm.clusters() + self.assertAllEqual(list(clusters.shape), + [self.num_centers, self.num_dims]) + + def test_fit(self): + gmm = GMM(self.num_centers, + initial_clusters='random', + batch_size=self.batch_size, + random_seed=4, + config=run_config.RunConfig(tf_random_seed=2)) + gmm.fit(x=self.points, steps=1) + score1 = gmm.score(x=self.points) + gmm = GMM(self.num_centers, + initial_clusters='random', + batch_size=self.batch_size, + random_seed=4, + config=run_config.RunConfig(tf_random_seed=2)) + gmm.fit(x=self.points, steps=10) + score2 = gmm.score(x=self.points) + self.assertGreater(score1, score2) + self.assertNear(self.true_score, score2, self.true_score * 0.15) 
+ + def test_infer(self): + gmm = GMM(self.num_centers, + initial_clusters=self.initial_means, + batch_size=self.batch_size, + steps=40, + continue_training=True, + random_seed=4, + config=run_config.RunConfig(tf_random_seed=2)) + gmm.fit(x=self.points, steps=60) + clusters = gmm.clusters() + + # Make a small test set + points, true_assignments, true_offsets = ( + self.make_random_points(clusters, 40)) + + assignments = np.ravel(gmm.predict(points)) + self.assertAllEqual(true_assignments, assignments) + + # Test score + score = gmm.score(points) + self.assertNear(score, np.sum(true_offsets), 4.05) + + def _compare_with_sklearn(self, cov_type): + # sklearn version. + iterations = 40 + np.random.seed(5) + sklearn_assignments = np.asarray([0, 0, 1, 0, 0, 0, 1, 0, 0, 1]) + sklearn_means = np.asarray([[144.83417719, 254.20130341], + [274.38754816, 353.16074346]]) + sklearn_covs = np.asarray([[[395.0081194, -4.50389512], + [-4.50389512, 408.27543989]], + [[385.17484203, -31.27834935], + [-31.27834935, 391.74249925]]]) + + # skflow version. 
+ gmm = GMM(self.num_centers, + initial_clusters=self.initial_means, + covariance_type=cov_type, + batch_size=self.num_points, + steps=iterations, + continue_training=True, + config=run_config.RunConfig(tf_random_seed=2)) + gmm.fit(self.points) + skflow_assignments = gmm.predict(self.points[:10, :]).astype(int) + self.assertAllClose(sklearn_assignments, + np.ravel(skflow_assignments)) + self.assertAllClose(sklearn_means, gmm.clusters()) + if cov_type == 'full': + self.assertAllClose(sklearn_covs, gmm.covariances(), rtol=0.01) + else: + for d in [0, 1]: + self.assertAllClose(np.diag(sklearn_covs[d]), + gmm.covariances()[d, :], rtol=0.01) + + def test_compare_full(self): + self._compare_with_sklearn('full') + + def test_compare_diag(self): + self._compare_with_sklearn('diag') + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/factorization/python/ops/kmeans_test.py b/tensorflow/contrib/factorization/python/ops/kmeans_test.py index bc706453c13..4fc2bea515a 100644 --- a/tensorflow/contrib/factorization/python/ops/kmeans_test.py +++ b/tensorflow/contrib/factorization/python/ops/kmeans_test.py @@ -153,9 +153,11 @@ class KMeansTest(tf.test.TestCase): def test_fit_with_cosine_distance(self): # Create points on y=x and y=1.5x lines to check the cosine similarity. # Note that euclidean distance will give different results in this case. 
- points = np.array([[9, 9], [0.5, 0.5], [10, 15], [0.4, 0.6]]) + points = np.array( + [[9, 9], [0.5, 0.5], [10, 15], [0.4, 0.6]], dtype=np.float32) # true centers are the unit vectors on lines y=x and y=1.5x - true_centers = np.array([[0.70710678, 0.70710678], [0.5547002, 0.83205029]]) + true_centers = np.array( + [[0.70710678, 0.70710678], [0.5547002, 0.83205029]], dtype=np.float32) kmeans = KMeans(2, initial_clusters=kmeans_ops.RANDOM_INIT, distance_metric=kmeans_ops.COSINE_DISTANCE, @@ -168,8 +170,9 @@ class KMeansTest(tf.test.TestCase): np.sort(true_centers, axis=0)) def test_transform_with_cosine_distance(self): - points = np.array([[2.5, 3.5], [2, 8], [3, 1], [3, 18], - [-2.5, -3.5], [-2, -8], [-3, -1], [-3, -18]]) + points = np.array( + [[2.5, 0.1], [2, 0.2], [3, 0.1], [4, 0.2], + [0.1, 2.5], [0.2, 2], [0.1, 3], [0.2, 4]], dtype=np.float32) true_centers = [normalize(np.mean(normalize(points)[4:, :], axis=0, keepdims=True))[0], @@ -180,8 +183,8 @@ class KMeansTest(tf.test.TestCase): initial_clusters=kmeans_ops.RANDOM_INIT, distance_metric=kmeans_ops.COSINE_DISTANCE, use_mini_batch=self.use_mini_batch, - config=self.config(3)) - kmeans.fit(x=points, steps=30, batch_size=8) + config=self.config(5)) + kmeans.fit(x=points, steps=50, batch_size=8) centers = normalize(kmeans.clusters()) self.assertAllClose(np.sort(centers, axis=0), @@ -193,16 +196,16 @@ class KMeansTest(tf.test.TestCase): self.assertAllClose(transform, true_transform, atol=1e-3) def test_predict_with_cosine_distance(self): - points = np.array([[2.5, 3.5], [2, 8], [3, 1], [3, 18], - [-2.5, -3.5], [-2, -8], [-3, -1], [-3, -18]]).astype( - np.float32) + points = np.array( + [[2.5, 0.1], [2, 0.2], [3, 0.1], [4, 0.2], + [0.1, 2.5], [0.2, 2], [0.1, 3], [0.2, 4]], dtype=np.float32) true_centers = np.array( [normalize(np.mean(normalize(points)[0:4, :], axis=0, keepdims=True))[0], normalize(np.mean(normalize(points)[4:, :], axis=0, - keepdims=True))[0]]) + keepdims=True))[0]], dtype=np.float32) 
true_assignments = [0] * 4 + [1] * 4 true_score = len(points) - np.tensordot(normalize(points), true_centers[true_assignments]) @@ -230,14 +233,14 @@ class KMeansTest(tf.test.TestCase): # the less populated centers. points = np.array([[2.5, 3.5], [2.5, 3.5], [-2, 3], [-2, 3], [-3, -3], [-3.1, -3.2], [-2.8, -3.], [-2.9, -3.1], [-3., -3.1], - [-3., -3.1], [-3.2, -3.], [-3., -3.]]).astype(np.float32) + [-3., -3.1], [-3.2, -3.], [-3., -3.]], dtype=np.float32) true_centers = np.array( [normalize(np.mean(normalize(points)[0:2, :], axis=0, keepdims=True))[0], normalize(np.mean(normalize(points)[2:4, :], axis=0, keepdims=True))[0], normalize(np.mean(normalize(points)[4:, :], axis=0, - keepdims=True))[0]]) + keepdims=True))[0]], dtype=np.float32) true_assignments = [0] * 2 + [1] * 2 + [2] * 8 true_score = len(points) - np.tensordot(normalize(points), true_centers[true_assignments]) @@ -262,7 +265,7 @@ class KMeansTest(tf.test.TestCase): self.assertAllClose(score, true_score, atol=1e-2) def test_fit_raise_if_num_clusters_larger_than_num_points_random_init(self): - points = np.array([[2.0, 3.0], [1.6, 8.2]]) + points = np.array([[2.0, 3.0], [1.6, 8.2]], dtype=np.float32) with self.assertRaisesOpError('less'): kmeans = KMeans(num_clusters=3, initial_clusters=kmeans_ops.RANDOM_INIT) @@ -270,7 +273,7 @@ class KMeansTest(tf.test.TestCase): def test_fit_raise_if_num_clusters_larger_than_num_points_kmeans_plus_plus( self): - points = np.array([[2.0, 3.0], [1.6, 8.2]]) + points = np.array([[2.0, 3.0], [1.6, 8.2]], dtype=np.float32) with self.assertRaisesOpError(AssertionError): kmeans = KMeans(num_clusters=3, diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op.cc b/tensorflow/contrib/ffmpeg/decode_audio_op.cc index ecfb6705a0f..10e35e165b2 100644 --- a/tensorflow/contrib/ffmpeg/decode_audio_op.cc +++ b/tensorflow/contrib/ffmpeg/decode_audio_op.cc @@ -21,10 +21,12 @@ #include "tensorflow/contrib/ffmpeg/ffmpeg_lib.h" #include "tensorflow/core/framework/op.h" #include 
"tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" namespace tensorflow { namespace ffmpeg { @@ -62,13 +64,11 @@ class FileDeleter { class DecodeAudioOp : public OpKernel { public: - explicit DecodeAudioOp(OpKernelConstruction* context) - : OpKernel(context) { + explicit DecodeAudioOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("file_format", &file_format_)); file_format_ = str_util::Lowercase(file_format_); const std::set valid_file_formats( - kValidFileFormats, - kValidFileFormats + TF_ARRAYSIZE(kValidFileFormats)); + kValidFileFormats, kValidFileFormats + TF_ARRAYSIZE(kValidFileFormats)); OP_REQUIRES(context, valid_file_formats.count(file_format_) == 1, errors::InvalidArgument( "file_format arg must be in {", @@ -79,8 +79,7 @@ class DecodeAudioOp : public OpKernel { OP_REQUIRES(context, samples_per_second_ > 0, errors::InvalidArgument("samples_per_second must be > 0.")); - OP_REQUIRES_OK( - context, context->GetAttr("channel_count", &channel_count_)); + OP_REQUIRES_OK(context, context->GetAttr("channel_count", &channel_count_)); OP_REQUIRES(context, channel_count_ > 0, errors::InvalidArgument("channel_count must be > 0.")); } @@ -112,12 +111,18 @@ class DecodeAudioOp : public OpKernel { context, result.ok(), errors::Unavailable("FFmpeg must be installed to run this op. FFmpeg " "can be found at http://www.ffmpeg.org.")); + } else if (result.code() == error::UNKNOWN) { + LOG(ERROR) << "Ffmpeg failed with error '" << result.error_message() + << "'. 
Returning empty tensor."; + Tensor* output = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({0, 0}), &output)); + return; } else { OP_REQUIRES_OK(context, result); } - OP_REQUIRES( - context, !output_samples.empty(), - errors::Unknown("No output created by FFmpeg.")); + OP_REQUIRES(context, !output_samples.empty(), + errors::Unknown("No output created by FFmpeg.")); OP_REQUIRES( context, output_samples.size() % channel_count_ == 0, errors::Unknown("FFmpeg created non-integer number of audio frames.")); @@ -125,9 +130,9 @@ class DecodeAudioOp : public OpKernel { // Copy the output data to the output Tensor. Tensor* output = nullptr; const int64 frame_count = output_samples.size() / channel_count_; - OP_REQUIRES_OK( - context, context->allocate_output( - 0, TensorShape({frame_count, channel_count_}), &output)); + OP_REQUIRES_OK(context, + context->allocate_output( + 0, TensorShape({frame_count, channel_count_}), &output)); auto matrix = output->tensor(); for (int32 frame = 0; frame < frame_count; ++frame) { for (int32 channel = 0; channel < channel_count_; ++channel) { @@ -151,6 +156,15 @@ REGISTER_OP("DecodeAudio") .Attr("file_format: string") .Attr("samples_per_second: int") .Attr("channel_count: int") + .SetShapeFn([](shape_inference::InferenceContext* c) { + int64 channels; + if (c->GetAttr("channel_count", &channels).ok()) { + c->set_output(0, c->Matrix(c->UnknownDim(), channels)); + } else { + c->set_output(0, c->Matrix(c->UnknownDim(), c->UnknownDim())); + } + return Status::OK(); + }) .Doc(R"doc( Processes the contents of an audio file into a tensor using FFmpeg to decode the file. @@ -162,7 +176,8 @@ different from the contents of the file, channels will be merged or created. contents: The binary audio file contents. sampled_audio: A rank 2 tensor containing all tracks of the audio. Dimension 0 - is time and dimension 1 is the channel. + is time and dimension 1 is the channel. 
If ffmpeg fails to decode the audio + then an empty tensor will be returned. file_format: A string describing the audio file format. This can be "wav" or "mp3". samples_per_second: The number of samples per second that the audio should have. diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op_test.py b/tensorflow/contrib/ffmpeg/decode_audio_op_test.py index a18d438b24f..58d0ab11b1d 100644 --- a/tensorflow/contrib/ffmpeg/decode_audio_op_test.py +++ b/tensorflow/contrib/ffmpeg/decode_audio_op_test.py @@ -72,6 +72,14 @@ class DecodeAudioOpTest(tf.test.TestCase): def testOgg(self): self._loadFileAndTest('mono_10khz.ogg', 'ogg', 0.57, 10000, 1) + def testInvalidFile(self): + with self.test_session(): + contents = 'invalid file' + audio_op = ffmpeg.decode_audio(contents, file_format='wav', + samples_per_second=10000, channel_count=2) + audio = audio_op.eval() + self.assertEqual(audio.shape, (0, 0)) + if __name__ == '__main__': tf.test.main() diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc index 510426cc034..9db453f0dd2 100644 --- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc +++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc @@ -38,7 +38,6 @@ namespace { const char kFfmpegExecutable[] = "ffmpeg"; const int32 kDefaultProbeSize = 5000000; // 5MB - std::vector FfmpegCommandLine(const string& input_filename, const string& output_filename, const string& input_format_id, @@ -63,6 +62,39 @@ std::vector FfmpegCommandLine(const string& input_filename, }; } +// Is a named binary installed and executable by the current process? +// Note that this is harder than it seems like it should be... 
+bool IsBinaryInstalled(const string& binary_name) { + string path = ::getenv("PATH"); + for (const string& dir : str_util::Split(path, ':')) { + const string binary_path = io::JoinPath(dir, binary_name); + char absolute_path[PATH_MAX + 1]; + ::realpath(binary_path.c_str(), absolute_path); + struct stat statinfo; + int result = ::stat(absolute_path, &statinfo); + if (result < 0) { + continue; + } + if (!S_ISREG(statinfo.st_mode)) { + continue; + } + + // Is the current user able to execute the file? + if (statinfo.st_uid == ::geteuid() && statinfo.st_mode & S_IXUSR) { + return true; + } + // Is the current group able to execute the file? + if (statinfo.st_uid == ::getegid() && statinfo.st_mode & S_IXGRP) { + return true; + } + // Is anyone able to execute the file? + if (statinfo.st_mode & S_IXOTH) { + return true; + } + } + return false; +} + [[noreturn]] int ExecuteFfmpeg(const std::vector& args) { std::vector args_chars; std::transform(args.begin(), args.end(), std::back_inserter(args_chars), @@ -191,6 +223,14 @@ Status ReadAudioFile(const string& filename, FfmpegCommandLine(filename, output_filename, audio_format_id, samples_per_second, channel_count); + // Unfortunately, it's impossible to differentiate an exec failure due to the + // binary being missing and an error from the binary's execution. Therefore, + // check to see if the binary *should* be available. If not, return an error + // that will be converted into a helpful error message by the TensorFlow op. + if (!IsBinaryInstalled(kFfmpegExecutable)) { + return Status(error::Code::NOT_FOUND, StrCat("FFmpeg could not be found.")); + } + // Execute ffmpeg and report errors. 
pid_t child_pid = ::fork(); if (child_pid < 0) { @@ -202,7 +242,7 @@ Status ReadAudioFile(const string& filename, int status_code; ::waitpid(child_pid, &status_code, 0); if (status_code) { - return Status(error::Code::NOT_FOUND, + return Status(error::Code::UNKNOWN, StrCat("FFmpeg execution failed: ", status_code)); } *output_samples = ReadPcmFile(output_filename); diff --git a/tensorflow/contrib/ffmpeg/encode_audio_op.cc b/tensorflow/contrib/ffmpeg/encode_audio_op.cc index 818285be5c1..bd3d6ae6998 100644 --- a/tensorflow/contrib/ffmpeg/encode_audio_op.cc +++ b/tensorflow/contrib/ffmpeg/encode_audio_op.cc @@ -16,6 +16,7 @@ #include #include "tensorflow/contrib/ffmpeg/ffmpeg_lib.h" +#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -24,8 +25,7 @@ namespace ffmpeg { class EncodeAudioOp : public OpKernel { public: - explicit EncodeAudioOp(OpKernelConstruction* context) - : OpKernel(context) { + explicit EncodeAudioOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("file_format", &file_format_)); file_format_ = str_util::Lowercase(file_format_); OP_REQUIRES(context, file_format_ == "wav", @@ -35,15 +35,15 @@ class EncodeAudioOp : public OpKernel { context, context->GetAttr("samples_per_second", &samples_per_second_)); OP_REQUIRES(context, samples_per_second_ > 0, errors::InvalidArgument("samples_per_second must be > 0.")); - OP_REQUIRES_OK( - context, context->GetAttr("bits_per_second", &bits_per_second_)); + OP_REQUIRES_OK(context, + context->GetAttr("bits_per_second", &bits_per_second_)); } void Compute(OpKernelContext* context) override { // Get and verify the input data. 
- OP_REQUIRES(context, context->num_inputs() == 1, - errors::InvalidArgument( - "EncodeAudio requires exactly one input.")); + OP_REQUIRES( + context, context->num_inputs() == 1, + errors::InvalidArgument("EncodeAudio requires exactly one input.")); const Tensor& contents = context->input(0); OP_REQUIRES(context, TensorShapeUtils::IsMatrix(contents.shape()), errors::InvalidArgument( @@ -88,6 +88,7 @@ REGISTER_OP("EncodeAudio") .Attr("file_format: string") .Attr("samples_per_second: int") .Attr("bits_per_second: int = 192000") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Processes a `Tensor` containing sampled audio with the number of channels and length of the audio specified by the dimensions of the `Tensor`. The diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py index ff33674f76b..b32242862eb 100644 --- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py +++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py @@ -67,7 +67,8 @@ def decode_audio(contents, file_format=None, samples_per_second=None, Returns: A rank 2 tensor that has time along dimension 0 and channels along dimension 1. Dimension 0 will be `samples_per_second * length` wide, and - dimension 1 will be `channel_count` wide. + dimension 1 will be `channel_count` wide. If ffmpeg fails to decode the + audio then an empty tensor will be returned. 
""" return gen_decode_audio_op_py.decode_audio( contents, file_format=file_format, samples_per_second=samples_per_second, diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD index 58ba08d2f1c..b77fe259f84 100644 --- a/tensorflow/contrib/framework/BUILD +++ b/tensorflow/contrib/framework/BUILD @@ -14,6 +14,7 @@ py_library( srcs = [ "__init__.py", "python/framework/__init__.py", + "python/framework/checkpoint_utils.py", "python/framework/deprecation.py", "python/framework/tensor_util.py", "python/ops/__init__.py", @@ -35,10 +36,19 @@ py_test( deps = ["//tensorflow:tensorflow_py"], ) +py_test( + name = "checkpoint_utils_test", + size = "small", + srcs = ["python/framework/checkpoint_utils_test.py"], + srcs_version = "PY2AND3", + tags = ["manual"], # http://b/30468735 + deps = ["//tensorflow:tensorflow_py"], +) + py_test( name = "ops_test", size = "small", - srcs = glob(["python/ops/ops_test.py"]), + srcs = ["python/ops/ops_test.py"], srcs_version = "PY2AND3", deps = ["//tensorflow:tensorflow_py"], ) @@ -51,9 +61,16 @@ py_test( deps = ["//tensorflow:tensorflow_py"], ) +py_test( + name = "deprecation_test", + srcs = ["python/framework/deprecation_test.py"], + srcs_version = "PY2AND3", + deps = ["//tensorflow:tensorflow_py"], +) + py_test( name = "tensor_util_test", - srcs = glob(["python/framework/tensor_util_test.py"]), + srcs = ["python/framework/tensor_util_test.py"], srcs_version = "PY2AND3", deps = ["//tensorflow:tensorflow_py"], ) @@ -61,7 +78,7 @@ py_test( py_test( name = "variables_test", size = "small", - srcs = glob(["python/ops/variables_test.py"]), + srcs = ["python/ops/variables_test.py"], srcs_version = "PY2AND3", deps = ["//tensorflow:tensorflow_py"], ) @@ -74,6 +91,15 @@ py_test( deps = ["//tensorflow:tensorflow_py"], ) +py_test( + name = "sampling_ops_threading_test", + size = "small", + srcs = ["python/ops/sampling_ops_threading_test.py"], + srcs_version = "PY2AND3", + tags = ["notsan"], + deps = 
["//tensorflow:tensorflow_py"], +) + filegroup( name = "all_files", srcs = glob( diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py index c8cca813bbd..1510683b365 100644 --- a/tensorflow/contrib/framework/__init__.py +++ b/tensorflow/contrib/framework/__init__.py @@ -30,6 +30,7 @@ ## Deprecation @@deprecated +@@deprecated_arg_values ## Arg_Scope @@arg_scope diff --git a/tensorflow/contrib/framework/python/framework/__init__.py b/tensorflow/contrib/framework/python/framework/__init__.py index d7724ba8e58..033faa6757f 100644 --- a/tensorflow/contrib/framework/python/framework/__init__.py +++ b/tensorflow/contrib/framework/python/framework/__init__.py @@ -19,5 +19,7 @@ from __future__ import division from __future__ import print_function # pylint: disable=wildcard-import +from tensorflow.contrib.framework.python.framework.checkpoint_utils import * from tensorflow.contrib.framework.python.framework.deprecation import deprecated +from tensorflow.contrib.framework.python.framework.deprecation import deprecated_arg_values from tensorflow.contrib.framework.python.framework.tensor_util import * diff --git a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py new file mode 100644 index 00000000000..d563fa20c21 --- /dev/null +++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py @@ -0,0 +1,288 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tools to work with checkpoints.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import six + +from tensorflow.python.ops import gen_io_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.ops import variables +from tensorflow.python.platform import gfile +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import saver +from tensorflow.python.training import training as train + +__all__ = [ + "load_checkpoint", + "load_variable", + "list_variables", + "init_from_checkpoint"] + + +def _get_checkpoint_filename(filepattern): + """Returns checkpoint filename given directory or specific filepattern.""" + if gfile.IsDirectory(filepattern): + return saver.latest_checkpoint(filepattern) + return filepattern + + +def load_checkpoint(filepattern): + """Returns CheckpointReader for latest checkpoint. + + Args: + filepattern: Directory with checkpoints file or path to checkpoint. + + Returns: + `CheckpointReader` object. + + Raises: + ValueError: if checkpoint_dir doesn't have 'checkpoint' file or checkpoints. + """ + filename = _get_checkpoint_filename(filepattern) + if filename is None: + raise ValueError("Couldn't find 'checkpoint' file or checkpoints in " + "given directory %s" % filepattern) + return train.NewCheckpointReader(filename) + + +def load_variable(checkpoint_dir, name): + """Returns a Tensor with the contents of the given variable in the checkpoint. + + Args: + checkpoint_dir: Directory with checkpoints file or path to checkpoint. + name: Name of the tensor to return. + + Returns: + `Tensor` object. + """ + # TODO(b/29227106): Fix this in the right place and remove this. 
+ if name.endswith(":0"): + name = name[:-2] + reader = load_checkpoint(checkpoint_dir) + return reader.get_tensor(name) + + +def list_variables(checkpoint_dir): + """Returns list of all variables in the latest checkpoint. + + Args: + checkpoint_dir: Directory with checkpoints file or path to checkpoint. + + Returns: + List of tuples `(name, shape)`. + """ + reader = load_checkpoint(checkpoint_dir) + variable_map = reader.get_variable_to_shape_map() + names = sorted(variable_map.keys()) + result = [] + for name in names: + result.append((name, variable_map[name])) + return result + + +# pylint: disable=protected-access +# Currently variable_scope doesn't provide very good APIs to access +# all variables under scope and retrieve and check existing scopes. +# TODO(ipolosukhin): Refactor variable_scope module to provide nicer APIs. + + +def _set_checkpoint_initializer(variable, file_pattern, tensor_name, slice_spec, + name="checkpoint_initializer"): + """Sets variable initializer to assign op form value in checkpoint's tensor. + + Args: + variable: `Variable` object. + file_pattern: string, where to load checkpoints from. + tensor_name: Name of the `Tensor` to load from checkpoint reader. + slice_spec: Slice specification for loading partitioned variables. + name: Name of the operation. + """ + base_type = variable.dtype.base_dtype + restore_op = gen_io_ops._restore_slice( + file_pattern, + tensor_name, + slice_spec, + base_type, + preferred_shard=-1, + name=name) + variable._initializer_op = state_ops.assign(variable, restore_op) + + +def _set_variable_or_list_initializer(variable_or_list, file_pattern, + tensor_name): + if isinstance(variable_or_list, (list, tuple)): + # A set of slices. 
+ slice_name = None + for v in variable_or_list: + if slice_name is None: + slice_name = v._save_slice_info.full_name + elif slice_name != v._save_slice_info.full_name: + raise ValueError("Slices must all be from the same tensor: %s != %s" % + (slice_name, v._save_slice_info.full_name)) + _set_checkpoint_initializer(v, file_pattern, tensor_name, + v._save_slice_info.spec) + else: + _set_checkpoint_initializer(variable_or_list, file_pattern, tensor_name, "") + + +def init_from_checkpoint(checkpoint_dir, assignment_map): + """Using assingment map initializes current variables with loaded tensors. + + Note: This overrides default initialization ops of specified variables and + redefines dtype. + + Assignment map supports following syntax: + `'checkpoint_scope_name/': 'scope_name/'` - will load all variables in + current `scope_name` from `checkpoint_scope_name` with matching variable + names. + `'checkpoint_scope_name/some_other_variable': 'scope_name/variable_name'` - + will initalize `scope_name/variable_name` variable + from `checkpoint_scope_name/some_other_variable`. + `'scope_variable_name': variable` - will initialize given `tf.Variable` + object with variable from the checkpoint. + `'scope_variable_name': list(variable)` - will initialize list of + partitioned variables with variable from the checkpoint. + `'scope_name/': '/'` - will load all variables in current `scope_name` from + checkpoint's root (e.g. no scope). + + Supports loading into partitioned variables, which are represented as + '/part_'. + + Example: + ```python + # Create variables. + with tf.variable_scope('test'): + m = tf.get_variable('my_var') + with tf.variable_scope('test2'): + var2 = tf.get_variable('my_var') + var3 = tf.get_variable(name="my1", shape=[100, 100], + partitioner=lambda shape, dtype: [5, 1]) + ... + # Specify which variables to intialize from checkpoint. + init_from_checkpoint(checkpoint_dir, { + 'some_var': 'test/my_var', + 'some_scope/': 'test2/'}) + ... 
+ # Or use `Variable` objects to identify what to initialize. + init_from_checkpoint(checkpoint_dir, { + 'some_scope/var2': var2, + }) + # Initialize partitioned variables + init_from_checkpoint(checkpoint_dir, { + 'some_var_from_ckpt': 'part_var', + }) + # Or specifying the list of `Variable` objects. + init_from_checkpoint(checkpoint_dir, { + 'some_var_from_ckpt': var3._get_variable_list(), + }) + ... + # Initialize variables as usual. + session.run(tf.get_all_variables()) + ``` + + Args: + checkpoint_dir: Directory with checkpoints file or path to checkpoint. + assignment_map: Dict, where keys are names of the variables in the + checkpoint and values are current variables or names of current variables + (in default graph). + + Raises: + tf.errors.OpError: If missing checkpoints or tensors in checkpoints. + ValueError: If missing variables in current graph. + """ + filepattern = _get_checkpoint_filename(checkpoint_dir) + reader = load_checkpoint(checkpoint_dir) + variable_map = reader.get_variable_to_shape_map() + for tensor_name_in_ckpt, current_var_or_name in six.iteritems(assignment_map): + var = None + # Check if this is Variable object or list of Variable objects (in case of + # partitioned variables). + is_var = lambda x: isinstance(x, variables.Variable) + if is_var(current_var_or_name) or ( + isinstance(current_var_or_name, list) + and all(is_var(v) for v in current_var_or_name)): + var = current_var_or_name + else: + var_scope = vs._get_default_variable_store() + # Check if this variable is in var_store. + var = var_scope._vars.get(current_var_or_name, None) + # Also check if variable is partitioned as list. + if var is None: + if current_var_or_name + "/part_0" in var_scope._vars: + var = [] + i = 0 + while current_var_or_name + "/part_%d" % i in var_scope._vars: + var.append(var_scope._vars[current_var_or_name + "/part_%d" % i]) + i += 1 + if var is not None: + # If 1 to 1 mapping was provided, find variable in the checkpoint. 
+ if tensor_name_in_ckpt not in variable_map: + raise ValueError("Tensor %s is not found in %s checkpoint" % ( + tensor_name_in_ckpt, checkpoint_dir + )) + if is_var(var): + # Additional at-call-time checks. + if not var.get_shape().is_compatible_with( + variable_map[tensor_name_in_ckpt]): + raise ValueError( + "Shape of variable %s (%s) doesn't match with shape of " + "tensor %s (%s) from checkpoint reader." % ( + var.name, str(var.get_shape()), + tensor_name_in_ckpt, str(variable_map[tensor_name_in_ckpt]) + )) + var_name = var.name + else: + var_name = ",".join([v.name for v in var]) + _set_variable_or_list_initializer(var, filepattern, tensor_name_in_ckpt) + logging.info("Initialize variable %s from checkpoint %s with %s" % ( + var_name, checkpoint_dir, tensor_name_in_ckpt + )) + else: + scopes = "" + # TODO(vihanjain): Support list of 'current_var_or_name' here. + if "/" in current_var_or_name: + scopes = current_var_or_name[:current_var_or_name.rindex("/")] + if not tensor_name_in_ckpt.endswith("/"): + raise ValueError( + "Assignment map with scope only name {} should map to scope only " + "{}. Should be 'scope/': 'other_scope/'.".format( + scopes, tensor_name_in_ckpt)) + # If scope to scope mapping was provided, find all variables in the scope. + for var_name in var_scope._vars: + if var_name.startswith(scopes): + # Lookup name with specified prefix and suffix from current variable. + # If tensor_name given is '/' (root), don't use it for full name. 
+ if tensor_name_in_ckpt != "/": + full_tensor_name = tensor_name_in_ckpt + var_name[len(scopes) + 1:] + else: + full_tensor_name = var_name[len(scopes) + 1:] + if full_tensor_name not in variable_map: + raise ValueError( + "Tensor %s (%s in %s) is not found in %s checkpoint" % ( + full_tensor_name, var_name[len(scopes) + 1:], + tensor_name_in_ckpt, checkpoint_dir + )) + var = var_scope._vars[var_name] + _set_variable_or_list_initializer(var, filepattern, full_tensor_name) + logging.info("Initialize variable %s from checkpoint %s with %s" % ( + var_name, checkpoint_dir, full_tensor_name + )) +# pylint: enable=protected-access diff --git a/tensorflow/contrib/learn/python/learn/utils/checkpoints_test.py b/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py similarity index 65% rename from tensorflow/contrib/learn/python/learn/utils/checkpoints_test.py rename to tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py index 47497a629e1..73a4a87dcca 100644 --- a/tensorflow/contrib/learn/python/learn/utils/checkpoints_test.py +++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py @@ -23,8 +23,6 @@ import os import tensorflow as tf -from tensorflow.contrib.learn.python.learn.utils import checkpoints - def _create_checkpoints(sess, checkpoint_dir): checkpoint_prefix = os.path.join(checkpoint_dir, "model") @@ -45,15 +43,14 @@ def _create_checkpoints(sess, checkpoint_dir): def _create_partition_checkpoints(sess, checkpoint_dir): checkpoint_prefix = os.path.join(checkpoint_dir, "model") checkpoint_state_name = "checkpoint" - # TODO(ipolosukhin): Enable this when get_variable partitioning works. 
-# v1 = tf.get_variable("var1", [100, 100], -# partitioner=tf.variable_axis_size_partitioner(axis=0, -# max_shard_bytes=512)) - v1 = tf.create_partitioned_variables( - shape=[100, 100], slicing=[5, 1], name="var1", - initializer=tf.truncated_normal_initializer(0.5)) + v1 = tf.get_variable( + name="var1", + shape=[100, 100], + initializer=tf.truncated_normal_initializer(0.5), + partitioner=tf.min_max_variable_partitioner(max_partitions=5, axis=0, + min_slice_size=8 << 10)) sess.run(tf.initialize_all_variables()) - v1_value = sess.run(v1) + v1_value = sess.run(v1._get_variable_list()) saver = tf.train.Saver() saver.save(sess, checkpoint_prefix, global_step=0, latest_filename=checkpoint_state_name) @@ -65,30 +62,36 @@ class CheckpointsTest(tf.test.TestCase): def testNoCheckpoints(self): checkpoint_dir = self.get_temp_dir() + "/no_checkpoints" with self.assertRaises(tf.errors.OpError): - self.assertAllEqual(checkpoints.load_variable(checkpoint_dir, "var1"), []) + self.assertAllEqual(tf.contrib.framework.load_variable( + checkpoint_dir, "var1"), []) def testNoTensor(self): checkpoint_dir = self.get_temp_dir() with self.test_session() as session: _, _, _, _ = _create_checkpoints(session, checkpoint_dir) with self.assertRaises(tf.errors.OpError): - self.assertAllEqual(checkpoints.load_variable(checkpoint_dir, "var5"), []) + self.assertAllEqual(tf.contrib.framework.load_variable( + checkpoint_dir, "var5"), []) def testGetTensor(self): checkpoint_dir = self.get_temp_dir() with self.test_session() as session: v1, v2, v3, v4 = _create_checkpoints(session, checkpoint_dir) - self.assertAllEqual(checkpoints.load_variable(checkpoint_dir, "var1"), v1) - self.assertAllEqual(checkpoints.load_variable(checkpoint_dir, "var2"), v2) - self.assertAllEqual(checkpoints.load_variable(checkpoint_dir, "var3"), v3) + self.assertAllEqual(tf.contrib.framework.load_variable( + checkpoint_dir, "var1"), v1) + self.assertAllEqual(tf.contrib.framework.load_variable( + checkpoint_dir, "var2"), v2) + 
self.assertAllEqual(tf.contrib.framework.load_variable( + checkpoint_dir, "var3"), v3) self.assertAllEqual( - checkpoints.load_variable(checkpoint_dir, "useful_scope/var4"), v4) + tf.contrib.framework.load_variable( + checkpoint_dir, "useful_scope/var4"), v4) def testGetAllVariables(self): checkpoint_dir = self.get_temp_dir() with self.test_session() as session: _create_checkpoints(session, checkpoint_dir) - self.assertEqual(checkpoints.list_variables(checkpoint_dir), + self.assertEqual(tf.contrib.framework.list_variables(checkpoint_dir), [("useful_scope/var4", [9, 9]), ("var1", [1, 10]), ("var2", [10, 10]), @@ -110,13 +113,13 @@ class CheckpointsTest(tf.test.TestCase): my4 = tf.get_variable("var4", [9, 9]) my3 = tf.get_variable("my3", [100, 100]) - checkpoints.init_from_checkpoint(checkpoint_dir, { - "some_scope/my1": "var1", - "some_scope/some_other_scope/other_useful_scope/": "useful_scope/", + tf.contrib.framework.init_from_checkpoint(checkpoint_dir, { + "var1": "some_scope/my1", + "useful_scope/": "some_scope/some_other_scope/other_useful_scope/", }) - checkpoints.init_from_checkpoint(checkpoint_dir, { - "some_scope/some_other_scope/my2": "var2", - my3: "var3", + tf.contrib.framework.init_from_checkpoint(checkpoint_dir, { + "var2": "some_scope/some_other_scope/my2", + "var3": my3, }) session.run(tf.initialize_all_variables()) @@ -143,8 +146,8 @@ class CheckpointsTest(tf.test.TestCase): with tf.variable_scope("useful_scope"): my4 = tf.get_variable("var4", [9, 9]) - checkpoints.init_from_checkpoint(checkpoint_dir, { - "some_scope/": "/", + tf.contrib.framework.init_from_checkpoint(checkpoint_dir, { + "/": "some_scope/", }) session.run(tf.initialize_all_variables()) @@ -162,23 +165,40 @@ class CheckpointsTest(tf.test.TestCase): with tf.Graph().as_default() as g: with self.test_session(graph=g) as session: with tf.variable_scope("some_scope"): - # TODO(ipolosukhin): Enable this when get_variable partitioning works. 
- # Currently get_variable with partitioner doesn't return Variable, - # but returns a concat op. -# my1 = tf.get_variable( -# "my1", [100, 100], -# partitioner=tf.variable_axis_size_partitioner(axis=0, -# max_shard_bytes=100)) - my1 = tf.create_partitioned_variables( - shape=[100, 100], slicing=[5, 1], name="my1", - initializer=tf.truncated_normal_initializer(0.5)) + my1 = tf.get_variable( + name="my1", + shape=[100, 100], + initializer=tf.truncated_normal_initializer(0.5), + partitioner=tf.min_max_variable_partitioner( + max_partitions=5, axis=0, min_slice_size=8 << 10)) + my1_var_list = my1._get_variable_list() - checkpoints.init_from_checkpoint(checkpoint_dir, { - "some_scope/my1": "var1", + tf.contrib.framework.init_from_checkpoint(checkpoint_dir, { + "var1": "some_scope/my1", }) session.run(tf.initialize_all_variables()) - my1_values = session.run(my1) + my1_values = session.run(my1_var_list) + self.assertAllEqual(my1_values, v1) + + # New graph and session. + with tf.Graph().as_default() as g: + with self.test_session(graph=g) as session: + with tf.variable_scope("some_scope"): + my1 = tf.get_variable( + name="my1", + shape=[100, 100], + initializer=tf.truncated_normal_initializer(0.5), + partitioner=tf.min_max_variable_partitioner( + max_partitions=5, axis=0, min_slice_size=8 << 10)) + my1_var_list = my1._get_variable_list() + + tf.contrib.framework.init_from_checkpoint(checkpoint_dir, { + "var1": my1_var_list, + }) + + session.run(tf.initialize_all_variables()) + my1_values = session.run(my1_var_list) self.assertAllEqual(my1_values, v1) def testInitFromCheckpointMissing(self): @@ -196,33 +216,33 @@ class CheckpointsTest(tf.test.TestCase): # No directory. with self.assertRaises(tf.errors.OpError): - checkpoints.init_from_checkpoint("no_dir", { - "some_scope/my1": "var1"}) + tf.contrib.framework.init_from_checkpoint("no_dir", { + "var1": "some_scope/my1"}) # No variable in checkpoint. 
with self.assertRaises(ValueError): - checkpoints.init_from_checkpoint(checkpoint_dir, { - "some_scope/my1": "no_var"}) + tf.contrib.framework.init_from_checkpoint(checkpoint_dir, { + "no_var": "some_scope/my1"}) # No variable in the graph. with self.assertRaises(ValueError): - checkpoints.init_from_checkpoint(checkpoint_dir, { - "some_scope/no_var": "var3"}) + tf.contrib.framework.init_from_checkpoint(checkpoint_dir, { + "var3": "some_scope/no_var"}) # Shape mismatch. with self.assertRaises(ValueError): - checkpoints.init_from_checkpoint(checkpoint_dir, { - "some_scope/my1": "var1"}) + tf.contrib.framework.init_from_checkpoint(checkpoint_dir, { + "var1": "some_scope/my1"}) # Variable 'my1' and 'my2' are missing in given checkpoint scope. with self.assertRaises(ValueError): - checkpoints.init_from_checkpoint(checkpoint_dir, { - "some_scope/": "useful_scope/"}) + tf.contrib.framework.init_from_checkpoint(checkpoint_dir, { + "useful_scope/": "some_scope/"}) # Mapping is not to scope name. 
with self.assertRaises(ValueError): - checkpoints.init_from_checkpoint(checkpoint_dir, { - "some_scope/": "useful_scope"}) + tf.contrib.framework.init_from_checkpoint(checkpoint_dir, { + "useful_scope": "some_scope/"}) if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/contrib/framework/python/framework/deprecation.py b/tensorflow/contrib/framework/python/framework/deprecation.py index f3a2ce97d77..10d8f26c837 100644 --- a/tensorflow/contrib/framework/python/framework/deprecation.py +++ b/tensorflow/contrib/framework/python/framework/deprecation.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools +import inspect import re from tensorflow.python.platform import tf_logging as logging @@ -34,43 +36,77 @@ def _get_qualified_name(function): return function.__name__ -def _add_deprecation_to_docstring(doc, date, instructions): +def _add_deprecation_to_docstring( + doc, instructions, no_doc_str, suffix_str, notice): """Adds a deprecation notice to a docstring.""" - lines = doc.splitlines() + if not doc: + lines = [no_doc_str] + else: + lines = doc.splitlines() + lines[0] += ' ' + suffix_str - lines[0] += ' (deprecated)' - - notice = [ - '', - 'THIS FUNCTION IS DEPRECATED. It will be removed after %s.' % date, - 'Instructions for updating:', - '%s' % instructions, - ] + notice = [''] + notice + [instructions] if len(lines) > 1: # Make sure that we keep our distance from the main body if lines[1].strip(): - notice += [''] + notice.append('') - lines = [lines[0]] + notice + lines[1:] + lines[1:1] = notice else: lines += notice return '\n'.join(lines) +def _add_deprecated_function_notice_to_docstring(doc, date, instructions): + """Adds a deprecation notice to a docstring for deprecated functions.""" + return _add_deprecation_to_docstring( + doc, instructions, + 'DEPRECATED FUNCTION', + '(deprecated)', [ + 'THIS FUNCTION IS DEPRECATED. It will be removed after %s.' 
% date, + 'Instructions for updating:']) + + +def _add_deprecated_arg_notice_to_docstring(doc, date, instructions): + """Adds a deprecation notice to a docstring for deprecated arguments.""" + return _add_deprecation_to_docstring( + doc, instructions, + 'DEPRECATED FUNCTION ARGUMENTS', + '(deprecated arguments)', [ + 'SOME ARGUMENTS ARE DEPRECATED. ' + 'They will be removed after %s.' % date, + 'Instructions for updating:']) + + +def _validate_deprecation_args(date, instructions): + if not date: + raise ValueError('Tell us what date this will be deprecated!') + if not re.match(r'20\d\d-[01]\d-[0123]\d', date): + raise ValueError('Date must be YYYY-MM-DD.') + if not instructions: + raise ValueError('Don\'t deprecate things without conversion instructions!') + + +def _validate_callable(func, decorator_name): + if not hasattr(func, '__call__'): + raise ValueError( + '%s is not a function. If this is a property, ' + 'apply @%s after @property.' % (func, decorator_name)) + + def deprecated(date, instructions): """Decorator for marking functions or methods deprecated. - This decorator adds a deprecation warning to a function's docstring. It has - the following format: + This decorator logs a deprecation warning whenever the decorated function is + called. It has the following format: (from ) is deprecated and will be removed after . Instructions for updating: - whenever the decorated function is called. will include the class - name if it is a method. + will include the class name if it is a method. It also edits the docstring of the function: ' (deprecated)' is appended to the first line of the docstring and a deprecation notice is prepended @@ -88,24 +124,73 @@ def deprecated(date, instructions): Raises: ValueError: If date is not in ISO 8601 format, or instructions are empty. 
""" - if not date: - raise ValueError('Tell us what date this will be deprecated!') - if not re.match(r'20\d\d-[01]\d-[0123]\d', date): - raise ValueError('Date must be YYYY-MM-DD.') - if not instructions: - raise ValueError('Don\'t deprecate things without conversion instructions!') + _validate_deprecation_args(date, instructions) def deprecated_wrapper(func): """Deprecation wrapper.""" + _validate_callable(func, 'deprecated') + @functools.wraps(func) def new_func(*args, **kwargs): - logging.warn('%s (from %s) is deprecated and will be removed after %s.\n' - 'Instructions for updating:\n%s', - _get_qualified_name(func), func.__module__, - date, instructions) + logging.warning( + '%s (from %s) is deprecated and will be removed after %s.\n' + 'Instructions for updating:\n%s', + _get_qualified_name(func), func.__module__, date, instructions) return func(*args, **kwargs) - new_func.__name__ = func.__name__ - new_func.__doc__ = _add_deprecation_to_docstring(func.__doc__, date, - instructions) - new_func.__dict__.update(func.__dict__) + new_func.__doc__ = _add_deprecated_function_notice_to_docstring( + func.__doc__, date, instructions) + return new_func + return deprecated_wrapper + + +def deprecated_arg_values(date, instructions, **deprecated_kwargs): + """Decorator for marking specific function argument values as deprecated. + + This decorator logs a deprecation warning whenever the decorated function is + called with the deprecated argument values. It has the following format: + + Calling (from ) with = is deprecated and + will be removed after . Instructions for updating: + + + will include the class name if it is a method. + + It also edits the docstring of the function: ' (deprecated arguments)' is + appended to the first line of the docstring and a deprecation notice is + prepended to the rest of the docstring. + + Args: + date: String. The date the function is scheduled to be removed. Must be + ISO 8601 (YYYY-MM-DD). + instructions: String. 
Instructions on how to update code using the + deprecated function. + **deprecated_kwargs: The deprecated argument values. + + Returns: + Decorated function or method. + + Raises: + ValueError: If date is not in ISO 8601 format, or instructions are empty. + """ + _validate_deprecation_args(date, instructions) + if not deprecated_kwargs: + raise ValueError('Specify which argument values are deprecated.') + + def deprecated_wrapper(func): + """Deprecation decorator.""" + _validate_callable(func, 'deprecated_arg_values') + @functools.wraps(func) + def new_func(*args, **kwargs): + """Deprecation wrapper.""" + named_args = inspect.getcallargs(func, *args, **kwargs) + for arg_name, arg_value in deprecated_kwargs.items(): + if arg_name in named_args and named_args[arg_name] == arg_value: + logging.warning( + 'Calling %s (from %s) with %s=%s is deprecated and will be ' + 'removed after %s.\nInstructions for updating:\n%s', + _get_qualified_name(func), func.__module__, + arg_name, arg_value, date, instructions) + return func(*args, **kwargs) + new_func.__doc__ = _add_deprecated_arg_notice_to_docstring( + func.__doc__, date, instructions) return new_func return deprecated_wrapper diff --git a/tensorflow/contrib/framework/python/framework/deprecation_test.py b/tensorflow/contrib/framework/python/framework/deprecation_test.py new file mode 100644 index 00000000000..b9572d626a6 --- /dev/null +++ b/tensorflow/contrib/framework/python/framework/deprecation_test.py @@ -0,0 +1,488 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""tensor_util tests.""" + +# pylint: disable=unused-import +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.contrib.framework.python.framework import deprecation +from tensorflow.python.platform import tf_logging as logging + + +class DeprecationTest(tf.test.TestCase): + + def _assert_subset(self, expected_subset, actual_set): + self.assertTrue( + actual_set.issuperset(expected_subset), + msg="%s is not a superset of %s." % (actual_set, expected_subset)) + + def test_deprecated_illegal_args(self): + instructions = "This is how you update..." + with self.assertRaisesRegexp(ValueError, "date"): + deprecation.deprecated(None, instructions) + with self.assertRaisesRegexp(ValueError, "date"): + deprecation.deprecated("", instructions) + with self.assertRaisesRegexp(ValueError, "YYYY-MM-DD"): + deprecation.deprecated("07-04-2016", instructions) + date = "2016-07-04" + with self.assertRaisesRegexp(ValueError, "instructions"): + deprecation.deprecated(date, None) + with self.assertRaisesRegexp(ValueError, "instructions"): + deprecation.deprecated(date, "") + + @tf.test.mock.patch.object(logging, "warning", autospec=True) + def test_static_fn_with_doc(self, mock_warning): + date = "2016-07-04" + instructions = "This is how you update..." + + @deprecation.deprecated(date, instructions) + def _fn(arg0, arg1): + """fn doc. + + Args: + arg0: Arg 0. + arg1: Arg 1. 
+ + Returns: + Sum of args. + """ + return arg0 + arg1 + + # Assert function docs are properly updated. + self.assertEqual("_fn", _fn.__name__) + self.assertEqual( + "fn doc. (deprecated)" + "\n" + "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nInstructions for updating:\n%s" + "\n" + "\n Args:" + "\n arg0: Arg 0." + "\n arg1: Arg 1." + "\n" + "\n Returns:" + "\n Sum of args." + "\n " % (date, instructions), + _fn.__doc__) + + # Assert calling new fn issues log warning. + self.assertEqual(3, _fn(1, 2)) + self.assertEqual(1, mock_warning.call_count) + (args, _) = mock_warning.call_args + self.assertRegexpMatches(args[0], r"deprecated and will be removed after") + self._assert_subset(set([date, instructions]), set(args[1:])) + + @tf.test.mock.patch.object(logging, "warning", autospec=True) + def test_static_fn_with_one_line_doc(self, mock_warning): + date = "2016-07-04" + instructions = "This is how you update..." + + @deprecation.deprecated(date, instructions) + def _fn(arg0, arg1): + """fn doc.""" + return arg0 + arg1 + + # Assert function docs are properly updated. + self.assertEqual("_fn", _fn.__name__) + self.assertEqual( + "fn doc. (deprecated)" + "\n" + "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nInstructions for updating:\n%s" % (date, instructions), + _fn.__doc__) + + # Assert calling new fn issues log warning. + self.assertEqual(3, _fn(1, 2)) + self.assertEqual(1, mock_warning.call_count) + (args, _) = mock_warning.call_args + self.assertRegexpMatches(args[0], r"deprecated and will be removed after") + self._assert_subset(set([date, instructions]), set(args[1:])) + + @tf.test.mock.patch.object(logging, "warning", autospec=True) + def test_static_fn_no_doc(self, mock_warning): + date = "2016-07-04" + instructions = "This is how you update..." + + @deprecation.deprecated(date, instructions) + def _fn(arg0, arg1): + return arg0 + arg1 + + # Assert function docs are properly updated. 
+ self.assertEqual("_fn", _fn.__name__) + self.assertEqual( + "DEPRECATED FUNCTION" + "\n" + "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nInstructions for updating:" + "\n%s" % (date, instructions), + _fn.__doc__) + + # Assert calling new fn issues log warning. + self.assertEqual(3, _fn(1, 2)) + self.assertEqual(1, mock_warning.call_count) + (args, _) = mock_warning.call_args + self.assertRegexpMatches(args[0], r"deprecated and will be removed after") + self._assert_subset(set([date, instructions]), set(args[1:])) + + @tf.test.mock.patch.object(logging, "warning", autospec=True) + def test_instance_fn_with_doc(self, mock_warning): + date = "2016-07-04" + instructions = "This is how you update..." + + class _Object(object): + + def __init(self): + pass + + @deprecation.deprecated(date, instructions) + def _fn(self, arg0, arg1): + """fn doc. + + Args: + arg0: Arg 0. + arg1: Arg 1. + + Returns: + Sum of args. + """ + return arg0 + arg1 + + # Assert function docs are properly updated. + self.assertEqual( + "fn doc. (deprecated)" + "\n" + "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nInstructions for updating:\n%s" + "\n" + "\n Args:" + "\n arg0: Arg 0." + "\n arg1: Arg 1." + "\n" + "\n Returns:" + "\n Sum of args." + "\n " % (date, instructions), + getattr(_Object, "_fn").__doc__) + + # Assert calling new fn issues log warning. + self.assertEqual(3, _Object()._fn(1, 2)) + self.assertEqual(1, mock_warning.call_count) + (args, _) = mock_warning.call_args + self.assertRegexpMatches(args[0], r"deprecated and will be removed after") + self._assert_subset(set([date, instructions]), set(args[1:])) + + @tf.test.mock.patch.object(logging, "warning", autospec=True) + def test_instance_fn_with_one_line_doc(self, mock_warning): + date = "2016-07-04" + instructions = "This is how you update..." 
+ + class _Object(object): + + def __init(self): + pass + + @deprecation.deprecated(date, instructions) + def _fn(self, arg0, arg1): + """fn doc.""" + return arg0 + arg1 + + # Assert function docs are properly updated. + self.assertEqual( + "fn doc. (deprecated)" + "\n" + "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nInstructions for updating:\n%s" % (date, instructions), + getattr(_Object, "_fn").__doc__) + + # Assert calling new fn issues log warning. + self.assertEqual(3, _Object()._fn(1, 2)) + self.assertEqual(1, mock_warning.call_count) + (args, _) = mock_warning.call_args + self.assertRegexpMatches(args[0], r"deprecated and will be removed after") + self._assert_subset(set([date, instructions]), set(args[1:])) + + @tf.test.mock.patch.object(logging, "warning", autospec=True) + def test_instance_fn_no_doc(self, mock_warning): + date = "2016-07-04" + instructions = "This is how you update..." + + class _Object(object): + + def __init(self): + pass + + @deprecation.deprecated(date, instructions) + def _fn(self, arg0, arg1): + return arg0 + arg1 + + # Assert function docs are properly updated. + self.assertEqual( + "DEPRECATED FUNCTION" + "\n" + "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nInstructions for updating:" + "\n%s" % (date, instructions), + getattr(_Object, "_fn").__doc__) + + # Assert calling new fn issues log warning. 
+ self.assertEqual(3, _Object()._fn(1, 2)) + self.assertEqual(1, mock_warning.call_count) + (args, _) = mock_warning.call_args + self.assertRegexpMatches(args[0], r"deprecated and will be removed after") + self._assert_subset(set([date, instructions]), set(args[1:])) + + @tf.test.mock.patch.object(logging, "warning", autospec=True) + def test_prop_wrong_order(self, mock_warning): + + with self.assertRaisesRegexp( + ValueError, "apply @deprecated after @property"): + # pylint: disable=unused-variable + + class _Object(object): + + def __init(self): + pass + + @deprecation.deprecated("2016-07-04", "Instructions.") + @property + def _prop(self): + return "prop_wrong_order" + + @tf.test.mock.patch.object(logging, "warning", autospec=True) + def test_prop_with_doc(self, mock_warning): + date = "2016-07-04" + instructions = "This is how you update..." + + class _Object(object): + + def __init(self): + pass + + @property + @deprecation.deprecated(date, instructions) + def _prop(self): + """prop doc. + + Returns: + String. + """ + return "prop_with_doc" + + # Assert function docs are properly updated. + self.assertEqual( + "prop doc. (deprecated)" + "\n" + "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nInstructions for updating:" + "\n%s" + "\n" + "\n Returns:" + "\n String." + "\n " % (date, instructions), + getattr(_Object, "_prop").__doc__) + + # Assert calling new fn issues log warning. + self.assertEqual("prop_with_doc", _Object()._prop) + self.assertEqual(1, mock_warning.call_count) + (args, _) = mock_warning.call_args + self.assertRegexpMatches(args[0], r"deprecated and will be removed after") + self._assert_subset(set([date, instructions]), set(args[1:])) + + @tf.test.mock.patch.object(logging, "warning", autospec=True) + def test_prop_no_doc(self, mock_warning): + date = "2016-07-04" + instructions = "This is how you update..." 
+ + class _Object(object): + + def __init(self): + pass + + @property + @deprecation.deprecated(date, instructions) + def _prop(self): + return "prop_no_doc" + + # Assert function docs are properly updated. + self.assertEqual( + "DEPRECATED FUNCTION" + "\n" + "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s." + "\nInstructions for updating:" + "\n%s" % (date, instructions), + getattr(_Object, "_prop").__doc__) + + # Assert calling new fn issues log warning. + self.assertEqual("prop_no_doc", _Object()._prop) + self.assertEqual(1, mock_warning.call_count) + (args, _) = mock_warning.call_args + self.assertRegexpMatches(args[0], r"deprecated and will be removed after") + self._assert_subset(set([date, instructions]), set(args[1:])) + + +class DeprecatedArgsTest(tf.test.TestCase): + + def _assert_subset(self, expected_subset, actual_set): + self.assertTrue( + actual_set.issuperset(expected_subset), + msg="%s is not a superset of %s." % (actual_set, expected_subset)) + + def test_deprecated_illegal_args(self): + instructions = "This is how you update..." 
+ with self.assertRaisesRegexp(ValueError, "date"): + deprecation.deprecated_arg_values( + None, instructions, deprecated=True) + with self.assertRaisesRegexp(ValueError, "date"): + deprecation.deprecated_arg_values( + "", instructions, deprecated=True) + with self.assertRaisesRegexp(ValueError, "YYYY-MM-DD"): + deprecation.deprecated_arg_values( + "07-04-2016", instructions, deprecated=True) + date = "2016-07-04" + with self.assertRaisesRegexp(ValueError, "instructions"): + deprecation.deprecated_arg_values( + date, None, deprecated=True) + with self.assertRaisesRegexp(ValueError, "instructions"): + deprecation.deprecated_arg_values( + date, "", deprecated=True) + with self.assertRaisesRegexp(ValueError, "argument", deprecated=True): + deprecation.deprecated_arg_values( + date, instructions) + + @tf.test.mock.patch.object(logging, "warning", autospec=True) + def test_static_fn_with_doc(self, mock_warning): + date = "2016-07-04" + instructions = "This is how you update..." + + @deprecation.deprecated_arg_values(date, instructions, deprecated=True) + def _fn(arg0, arg1, deprecated=True): + """fn doc. + + Args: + arg0: Arg 0. + arg1: Arg 1. + deprecated: Deprecated! + + Returns: + Sum of args. + """ + return arg0 + arg1 if deprecated else arg1 + arg0 + + # Assert function docs are properly updated. + self.assertEqual("_fn", _fn.__name__) + self.assertEqual( + "fn doc. (deprecated arguments)" + "\n" + "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s." + "\nInstructions for updating:\n%s" + "\n" + "\n Args:" + "\n arg0: Arg 0." + "\n arg1: Arg 1." + "\n deprecated: Deprecated!" + "\n" + "\n Returns:" + "\n Sum of args." + "\n " % (date, instructions), + _fn.__doc__) + + # Assert calling new fn with non-deprecated value logs nothing. + self.assertEqual(3, _fn(1, 2, deprecated=False)) + self.assertEqual(0, mock_warning.call_count) + + # Assert calling new fn with deprecated value issues log warning. 
+ self.assertEqual(3, _fn(1, 2, deprecated=True)) + self.assertEqual(1, mock_warning.call_count) + (args, _) = mock_warning.call_args + self.assertRegexpMatches(args[0], r"deprecated and will be removed after") + self._assert_subset(set([date, instructions]), set(args[1:])) + + # Assert calling new fn with default deprecated value issues log warning. + self.assertEqual(3, _fn(1, 2)) + self.assertEqual(2, mock_warning.call_count) + + @tf.test.mock.patch.object(logging, "warning", autospec=True) + def test_static_fn_with_one_line_doc(self, mock_warning): + date = "2016-07-04" + instructions = "This is how you update..." + + @deprecation.deprecated_arg_values(date, instructions, deprecated=True) + def _fn(arg0, arg1, deprecated=True): + """fn doc.""" + return arg0 + arg1 if deprecated else arg1 + arg0 + + # Assert function docs are properly updated. + self.assertEqual("_fn", _fn.__name__) + self.assertEqual( + "fn doc. (deprecated arguments)" + "\n" + "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s." + "\nInstructions for updating:\n%s" % (date, instructions), + _fn.__doc__) + + # Assert calling new fn with non-deprecated value logs nothing. + self.assertEqual(3, _fn(1, 2, deprecated=False)) + self.assertEqual(0, mock_warning.call_count) + + # Assert calling new fn with deprecated value issues log warning. + self.assertEqual(3, _fn(1, 2, deprecated=True)) + self.assertEqual(1, mock_warning.call_count) + (args, _) = mock_warning.call_args + self.assertRegexpMatches(args[0], r"deprecated and will be removed after") + self._assert_subset(set([date, instructions]), set(args[1:])) + + # Assert calling new fn with default deprecated value issues log warning. + self.assertEqual(3, _fn(1, 2)) + self.assertEqual(2, mock_warning.call_count) + + @tf.test.mock.patch.object(logging, "warning", autospec=True) + def test_static_fn_no_doc(self, mock_warning): + date = "2016-07-04" + instructions = "This is how you update..." 
+ + @deprecation.deprecated_arg_values(date, instructions, deprecated=True) + def _fn(arg0, arg1, deprecated=True): + return arg0 + arg1 if deprecated else arg1 + arg0 + + # Assert function docs are properly updated. + self.assertEqual("_fn", _fn.__name__) + self.assertEqual( + "DEPRECATED FUNCTION ARGUMENTS" + "\n" + "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s." + "\nInstructions for updating:" + "\n%s" % (date, instructions), + _fn.__doc__) + + # Assert calling new fn with non-deprecated value logs nothing. + self.assertEqual(3, _fn(1, 2, deprecated=False)) + self.assertEqual(0, mock_warning.call_count) + + # Assert calling new fn issues log warning. + self.assertEqual(3, _fn(1, 2, deprecated=True)) + self.assertEqual(1, mock_warning.call_count) + (args, _) = mock_warning.call_args + self.assertRegexpMatches(args[0], r"deprecated and will be removed after") + self._assert_subset(set([date, instructions]), set(args[1:])) + + # Assert calling new fn with default deprecated value issues log warning. 
+ self.assertEqual(3, _fn(1, 2)) + self.assertEqual(2, mock_warning.call_count) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/contrib/framework/python/ops/sampling_ops.py b/tensorflow/contrib/framework/python/ops/sampling_ops.py index d44fe3b3f6c..1d4fed9bd41 100644 --- a/tensorflow/contrib/framework/python/ops/sampling_ops.py +++ b/tensorflow/contrib/framework/python/ops/sampling_ops.py @@ -27,6 +27,7 @@ from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import logging_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops import variables from tensorflow.python.training import input as input_ops from tensorflow.python.training import queue_runner @@ -34,10 +35,8 @@ __all__ = ['stratified_sample', 'stratified_sample_unknown_dist',] -# TODO(joelshor): Use an exponential-moving-average to estimate the initial -# class distribution and remove the requirement that it be provided. -def stratified_sample(tensors, labels, init_probs, target_probs, batch_size, - enqueue_many=False, queue_capacity=16, +def stratified_sample(tensors, labels, target_probs, batch_size, + init_probs=None, enqueue_many=False, queue_capacity=16, threads_per_queue=1, name=None): """Stochastically creates batches based on per-class probabilities. @@ -52,11 +51,12 @@ def stratified_sample(tensors, labels, init_probs, target_probs, batch_size, batch, according to enqueue_many. labels: Tensor for label of data. Label is a single integer or a batch, depending on enqueue_many. It is not a one-hot vector. - init_probs: Class proportions in the data. An object whose type has a - registered Tensor conversion function. target_probs: Target class proportions in batch. An object whose type has a registered Tensor conversion function. batch_size: Size of batch to be returned. + init_probs: Class proportions in the data. 
An object whose type has a + registered Tensor conversion function, or `None` for estimating the + initial distribution. enqueue_many: Bool. If true, interpret input tensors as having a batch dimension. queue_capacity: Capacity of the large queue that holds input examples. @@ -81,10 +81,9 @@ def stratified_sample(tensors, labels, init_probs, target_probs, batch_size, data, label = data_provider.Get(['data', 'label']) # Get stratified batch according to per-class probabilities. - init_probs = [1.0/NUM_CLASSES for _ in range(NUM_CLASSES)] target_probs = [...distribution you want...] [data_batch], labels = tf.contrib.framework.sampling_ops.stratified_sample( - [data], label, init_probs, target_probs) + [data], label, target_probs) # Run batch through network. ... @@ -92,22 +91,34 @@ def stratified_sample(tensors, labels, init_probs, target_probs, batch_size, with ops.op_scope(tensors + [labels], name, 'stratified_sample'): tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors) labels = ops.convert_to_tensor(labels) - init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32) target_probs = ops.convert_to_tensor(target_probs, dtype=dtypes.float32) # Reduce the case of a single example to that of a batch of size 1. if not enqueue_many: tensor_list = [array_ops.expand_dims(tensor, 0) for tensor in tensor_list] labels = array_ops.expand_dims(labels, 0) + # If `init_probs` is `None`, set up online estimation of data distribution. + if init_probs is None: + # We use `target_probs` to get the number of classes, so its shape must be + # fully defined at graph construction time. + target_probs.get_shape().assert_is_fully_defined() + init_probs = _estimate_data_distribution( + labels, target_probs.get_shape().num_elements()) + else: + init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32) + # Validate that input is consistent. 
tensor_list, labels, [init_probs, target_probs] = _verify_input( tensor_list, labels, [init_probs, target_probs]) # Check that all zero initial probabilities also have zero target # probabilities. - assert_op = logging_ops.Assert(math_ops.reduce_all(math_ops.logical_or( - math_ops.not_equal(init_probs, 0), - math_ops.equal(target_probs, 0))), [init_probs, target_probs]) + assert_op = logging_ops.Assert( + math_ops.reduce_all(math_ops.logical_or( + math_ops.not_equal(init_probs, 0), + math_ops.equal(target_probs, 0))), + ['All classes with zero initial probability must also have zero target ' + 'probability: ', init_probs, target_probs]) init_probs = control_flow_ops.with_dependencies([assert_op], init_probs) # Calculate acceptance sampling probabilities. @@ -212,6 +223,40 @@ def stratified_sample_unknown_dist(tensors, labels, probs, batch_size, per_class_queues, probs, batch_size) +def _estimate_data_distribution(labels, num_classes): + """Estimate data distribution as labels are seen.""" + # Variable to track running count of classes. Add 1 to avoid division-by-zero, + # and to guarantee that calculation of acceptance probabilities is (mostly) + # correct. + num_examples_per_class_seen = variables.Variable( + initial_value=[1] * num_classes, trainable=False, name='class_count', + dtype=dtypes.int64) + + # Update the class-count based on what labels are seen in batch. + num_examples_per_class_seen = num_examples_per_class_seen.assign_add( + math_ops.reduce_sum(array_ops.one_hot(labels, num_classes, + dtype=dtypes.int64), 0)) + + # Normalize count into a probability. + # NOTE: Without the `+= 0` line below, the test + # `testMultiThreadedEstimateDataDistribution` fails. The reason is that + # before this line, `num_examples_per_class_seen` is a Tensor that shares a + # buffer with an underlying `ref` object. When the `ref` is changed by another + # thread, `num_examples_per_class_seen` changes as well. 
Since this can happen + # in the middle of the normalization computation, we get probabilities that + # are very far from summing to one. Adding `+= 0` copies the contents of the + # tensor to a new buffer, which will be consistent from the start to the end + # of the normalization computation. + num_examples_per_class_seen += 0 + init_prob_estimate = math_ops.truediv( + num_examples_per_class_seen, + math_ops.reduce_sum(num_examples_per_class_seen)) + + # Must return float32 (not float64) to agree with downstream `_verify_input` + # checks. + return math_ops.cast(init_prob_estimate, dtypes.float32) + + def _verify_input(tensor_list, labels, probs_list): """Verify that batched inputs are well-formed.""" checked_probs_list = [] diff --git a/tensorflow/contrib/framework/python/ops/sampling_ops_test.py b/tensorflow/contrib/framework/python/ops/sampling_ops_test.py index 4ec7d86ec82..35b56bdfa1a 100644 --- a/tensorflow/contrib/framework/python/ops/sampling_ops_test.py +++ b/tensorflow/contrib/framework/python/ops/sampling_ops_test.py @@ -20,6 +20,7 @@ from __future__ import print_function import numpy as np import tensorflow as tf +from tensorflow.python.platform import tf_logging as logging class SamplingOpsTest(tf.test.TestCase): @@ -33,15 +34,22 @@ class SamplingOpsTest(tf.test.TestCase): # Curry the rejection sampler so we can easily run the same tests on both # stratified_sample and stratified_sample_unknown_dist. 
- def curried_sampler(val, lbls, probs, batch, enqueue_many=True): + def curried_sampler(tensors, labels, probs, batch_size, enqueue_many=True): return tf.contrib.framework.sampling_ops.stratified_sample( - val, lbls, initial_p, probs, batch, enqueue_many=enqueue_many) + tensors=tensors, + labels=labels, + target_probs=probs, + batch_size=batch_size, + init_probs=initial_p, + enqueue_many=enqueue_many) + samplers = [ tf.contrib.framework.sampling_ops.stratified_sample_unknown_dist, curried_sampler, ] for sampler in samplers: + logging.info('Now testing `%s`', sampler.__class__.__name__) # Label must have only batch dimension if enqueue_many is True. with self.assertRaises(ValueError): sampler(val, tf.zeros([]), probs, batch_size, enqueue_many=True) @@ -70,20 +78,21 @@ class SamplingOpsTest(tf.test.TestCase): # Probabilities shape must be fully defined. with self.assertRaises(ValueError): - sampler(val, label, tf.placeholder(tf.float32, shape=[None]), - batch_size) + sampler( + val, label, tf.placeholder( + tf.float32, shape=[None]), batch_size) # In the rejection sampling case, make sure that probability lengths are # the same. with self.assertRaises(ValueError): tf.contrib.framework.sampling_ops.stratified_sample( - val, label, [.2] * 5, [.1] * 10, batch_size) + val, label, [.1] * 10, batch_size, init_probs=[.2] * 5) # In the rejection sampling case, make sure that zero initial probability # classes also have zero target probability. with self.assertRaises(ValueError): tf.contrib.framework.sampling_ops.stratified_sample( - val, label, [0, .5, .5], [.2, .4, .4], batch_size) + val, label, [.2, .4, .4], batch_size, init_probs=[0, .5, .5]) # Probabilities must be 1D. with self.assertRaises(ValueError): @@ -116,15 +125,17 @@ class SamplingOpsTest(tf.test.TestCase): # Run session that should fail. 
with self.test_session() as sess: with self.assertRaises(tf.errors.InvalidArgumentError): - sess.run([val_tf, lbl_tf], feed_dict={label_ph: illegal_label, - probs_ph: valid_probs}) + sess.run([val_tf, lbl_tf], + feed_dict={label_ph: illegal_label, + probs_ph: valid_probs}) for illegal_prob in illegal_probs: # Run session that should fail. with self.test_session() as sess: with self.assertRaises(tf.errors.InvalidArgumentError): - sess.run([prob_tf], feed_dict={label_ph: valid_labels, - probs_ph: illegal_prob}) + sess.run([prob_tf], + feed_dict={label_ph: valid_labels, + probs_ph: illegal_prob}) def batchingBehaviorHelper(self, sampler): batch_size = 20 @@ -152,15 +163,14 @@ class SamplingOpsTest(tf.test.TestCase): lbl_input_batch = tf.ones([], dtype=tf.int32) probs = np.array([0, 1, 0, 0, 0]) batches = tf.contrib.framework.sampling_ops.stratified_sample( - val_input_batch, lbl_input_batch, probs, probs, batch_size) + val_input_batch, lbl_input_batch, probs, batch_size, init_probs=probs) batches += tf.contrib.framework.sampling_ops.stratified_sample( - val_input_batch, lbl_input_batch, probs, probs, batch_size) + val_input_batch, lbl_input_batch, probs, batch_size, init_probs=probs) batches += tf.contrib.framework.sampling_ops.stratified_sample_unknown_dist( val_input_batch, lbl_input_batch, probs, batch_size) batches += tf.contrib.framework.sampling_ops.stratified_sample_unknown_dist( val_input_batch, lbl_input_batch, probs, batch_size) - summary_op = tf.merge_summary(tf.get_collection( - tf.GraphKeys.SUMMARIES)) + summary_op = tf.merge_summary(tf.get_collection(tf.GraphKeys.SUMMARIES)) with self.test_session() as sess: coord = tf.train.Coordinator() @@ -177,9 +187,15 @@ class SamplingOpsTest(tf.test.TestCase): def testRejectionBatchingBehavior(self): initial_p = [0, .3, 0, .7, 0] + def curried_sampler(val, lbls, probs, batch, enqueue_many=True): return tf.contrib.framework.sampling_ops.stratified_sample( - val, lbls, initial_p, probs, batch, 
enqueue_many=enqueue_many) + val, + lbls, + probs, + batch, + init_probs=initial_p, + enqueue_many=enqueue_many) self.batchingBehaviorHelper(curried_sampler) @@ -190,8 +206,7 @@ class SamplingOpsTest(tf.test.TestCase): lbl2 = 3 # This cond allows the necessary class queues to be populated. label = tf.cond( - tf.greater(.5, tf.random_uniform([])), - lambda: tf.constant(lbl1), + tf.greater(.5, tf.random_uniform([])), lambda: tf.constant(lbl1), lambda: tf.constant(lbl2)) val = [np.array([1, 4]) * label] probs = tf.placeholder(tf.float32, shape=[5]) @@ -225,7 +240,7 @@ class SamplingOpsTest(tf.test.TestCase): def testBatchDimensionNotRequired(self): classes = 5 # Probs must be a tensor, since we pass it directly to _verify_input. - probs = tf.constant([1.0/classes] * classes) + probs = tf.constant([1.0 / classes] * classes) # Make sure that these vals/labels pairs don't throw any runtime exceptions. legal_input_pairs = [ @@ -243,16 +258,17 @@ class SamplingOpsTest(tf.test.TestCase): # Run graph to make sure there are no shape-related runtime errors. for vals, labels in legal_input_pairs: with self.test_session() as sess: - sess.run([val_tf, labels_tf], feed_dict={vals_ph: vals, - labels_ph: labels}) + sess.run([val_tf, labels_tf], + feed_dict={vals_ph: vals, + labels_ph: labels}) def dataListHelper(self, sampler): batch_size = 20 val_input_batch = [tf.zeros([2, 3, 4]), tf.ones([2, 4]), tf.ones(2) * 3] lbl_input_batch = tf.ones([], dtype=tf.int32) probs = np.array([0, 1, 0, 0, 0]) - val_list, lbls = sampler( - val_input_batch, lbl_input_batch, probs, batch_size) + val_list, lbls = sampler(val_input_batch, lbl_input_batch, probs, + batch_size) # Check output shapes. 
self.assertTrue(isinstance(val_list, list)) @@ -277,9 +293,16 @@ class SamplingOpsTest(tf.test.TestCase): def testRejectionDataListInput(self): initial_p = [0, 1, 0, 0, 0] + def curried_sampler(val, lbls, probs, batch, enqueue_many=False): return tf.contrib.framework.sampling_ops.stratified_sample( - val, lbls, initial_p, probs, batch, enqueue_many=enqueue_many) + val, + lbls, + probs, + batch, + init_probs=initial_p, + enqueue_many=enqueue_many) + self.dataListHelper(curried_sampler) def normalBehaviorHelper(self, sampler): @@ -289,8 +312,7 @@ class SamplingOpsTest(tf.test.TestCase): lbl2 = 3 # This cond allows the necessary class queues to be populated. label = tf.cond( - tf.greater(.5, tf.random_uniform([])), - lambda: tf.constant(lbl1), + tf.greater(.5, tf.random_uniform([])), lambda: tf.constant(lbl1), lambda: tf.constant(lbl2)) val = [np.array([1, 4]) * label] probs = np.array([.8, 0, 0, .2, 0]) @@ -302,6 +324,9 @@ class SamplingOpsTest(tf.test.TestCase): data_l = [] label_l = [] with self.test_session() as sess: + # Need to initialize variables that keep running total of classes seen. + tf.initialize_all_variables().run() + coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(coord=coord) @@ -329,7 +354,7 @@ class SamplingOpsTest(tf.test.TestCase): # is fixed, for a given implementation, this test will pass or fail 100% of # the time. This use of assertNear is to cover cases where someone changes # an implementation detail, which would cause the random behavior to differ. 
- self.assertNear(actual_lbl, expected_label, 3*lbl_std_dev_of_mean) + self.assertNear(actual_lbl, expected_label, 3 * lbl_std_dev_of_mean) def testNormalBehavior(self): self.normalBehaviorHelper( @@ -337,10 +362,26 @@ class SamplingOpsTest(tf.test.TestCase): def testRejectionNormalBehavior(self): initial_p = [.7, 0, 0, .3, 0] + def curried_sampler(val, lbls, probs, batch, enqueue_many=False): return tf.contrib.framework.sampling_ops.stratified_sample( - val, lbls, initial_p, probs, batch, enqueue_many=enqueue_many) + val, + lbls, + probs, + batch, + init_probs=initial_p, + enqueue_many=enqueue_many) + self.normalBehaviorHelper(curried_sampler) + def testRejectionNormalBehaviorWithOnlineInitPEstimate(self): + + def curried_sampler(val, lbls, probs, batch, enqueue_many=False): + return tf.contrib.framework.sampling_ops.stratified_sample( + val, lbls, probs, batch, init_probs=None, enqueue_many=enqueue_many) + + self.normalBehaviorHelper(curried_sampler) + + if __name__ == '__main__': tf.test.main() diff --git a/tensorflow/contrib/framework/python/ops/sampling_ops_threading_test.py b/tensorflow/contrib/framework/python/ops/sampling_ops_threading_test.py new file mode 100644 index 00000000000..3812c3348c4 --- /dev/null +++ b/tensorflow/contrib/framework/python/ops/sampling_ops_threading_test.py @@ -0,0 +1,65 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +# pylint: disable=unused-import +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +class SamplingOpsThreadingTest(tf.test.TestCase): + + def testMultiThreadedEstimateDataDistribution(self): + num_classes = 10 + + # Set up graph. + tf.set_random_seed(1234) + label = tf.cast(tf.round(tf.random_uniform([1]) * num_classes), tf.int32) + + prob_estimate = tf.contrib.framework.sampling_ops._estimate_data_distribution( # pylint: disable=line-too-long + label, num_classes) + # Check that prob_estimate is well-behaved in a multithreaded context. + _, _, [prob_estimate] = tf.contrib.framework.sampling_ops._verify_input( + [], label, [prob_estimate]) + + # Use queues to run multiple threads over the graph, each of which + # fetches `prob_estimate`. + queue = tf.FIFOQueue( + capacity=25, + dtypes=[prob_estimate.dtype], + shapes=[prob_estimate.get_shape()]) + enqueue_op = queue.enqueue([prob_estimate]) + tf.train.add_queue_runner(tf.train.QueueRunner(queue, [enqueue_op] * 25)) + out_tensor = queue.dequeue() + + # Run the multi-threaded session. + with self.test_session() as sess: + # Need to initialize variables that keep running total of classes seen. + tf.initialize_all_variables().run() + + coord = tf.train.Coordinator() + threads = tf.train.start_queue_runners(coord=coord) + + for _ in range(25): + sess.run([out_tensor]) + + coord.request_stop() + coord.join(threads) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc b/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc index b3addf5746c..e854292f9da 100644 --- a/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc +++ b/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" namespace tensorflow { REGISTER_OP("SparseFeatureCross") @@ -31,6 +32,12 @@ REGISTER_OP("SparseFeatureCross") .Attr("dense_types: list({int64, string}) >= 0") .Attr("out_type: {int64, string}") .Attr("internal_type: {int64, string}") + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->Matrix(c->UnknownDim(), 2)); + c->set_output(1, c->Vector(c->UnknownDim())); + c->set_output(2, c->Vector(2)); + return Status::OK(); + }) .Doc(R"doc( Generates sparse cross form a list of sparse tensors. diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py index f4fedd766de..95c55a03dd3 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column.py +++ b/tensorflow/contrib/layers/python/layers/feature_column.py @@ -75,6 +75,7 @@ import abc import collections import math +from tensorflow.contrib.framework.python.framework import checkpoint_utils from tensorflow.contrib.framework.python.ops import variables as contrib_variables from tensorflow.contrib.layers.python.layers import embedding_ops from tensorflow.contrib.layers.python.ops import bucketization_op @@ -149,6 +150,7 @@ class _FeatureColumn(object): raise ValueError("Calling an abstract method.") +# TODO(b/30410315): Support warm starting in all feature columns. class _SparseColumn(_FeatureColumn, collections.namedtuple("_SparseColumn", ["column_name", "is_integerized", @@ -191,35 +193,36 @@ class _SparseColumn(_FeatureColumn, combiner="sum", dtype=dtypes.string): if is_integerized and bucket_size is None: - raise ValueError("bucket_size should be set if is_integerized=True. " + raise ValueError("bucket_size must be set if is_integerized is True. 
" "column_name: {}".format(column_name)) if is_integerized and not dtype.is_integer: - raise ValueError("dtype should be an integer if is_integerized is True. " - "Column {}.".format(column_name)) + raise ValueError("dtype must be an integer if is_integerized is True. " + "dtype: {}, column_name: {}.".format(dtype, column_name)) if bucket_size is None and lookup_config is None: - raise ValueError("one of bucket_size or lookup_config should be " - "set. column_name: {}".format(column_name)) + raise ValueError("one of bucket_size or lookup_config must be set. " + "column_name: {}".format(column_name)) if bucket_size is not None and lookup_config: raise ValueError("one and only one of bucket_size or lookup_config " - "should be set. column_name: {}".format(column_name)) + "must be set. column_name: {}".format(column_name)) if bucket_size is not None and bucket_size < 2: - raise ValueError("bucket_size should be at least 2. " - "column_name: {}".format(column_name)) + raise ValueError("bucket_size must be at least 2. " + "bucket_size: {}, column_name: {}".format(bucket_size, + column_name)) if ((lookup_config) and (not isinstance(lookup_config, _SparseIdLookupConfig))): raise TypeError( - "lookup_config should be an instance of _SparseIdLookupConfig. " + "lookup_config must be an instance of _SparseIdLookupConfig. " "Given one is in type {} for column_name {}".format( type(lookup_config), column_name)) if (lookup_config and lookup_config.vocabulary_file and lookup_config.vocab_size is None): - raise ValueError("vocab_size should be defined. " + raise ValueError("vocab_size must be defined. " "column_name: {}".format(column_name)) return super(_SparseColumn, cls).__new__(cls, column_name, is_integerized, @@ -260,8 +263,8 @@ class _SparseColumn(_FeatureColumn, input_tensor, weight_collections=None, trainable=True): - raise ValueError("Column {} is not supported in DNN. 
" - "Please use embedding_column.".format(self)) + raise ValueError("SparseColumn is not supported in DNN. " + "Please use embedding_column. column: {}".format(self)) def to_weighted_sum(self, input_tensor, @@ -277,7 +280,7 @@ class _SparseColumn(_FeatureColumn, initializer=init_ops.zeros_initializer, combiner=self.combiner, trainable=trainable, - name=self.name + "_weights") + name=self.name) class _SparseColumnIntegerized(_SparseColumn): @@ -289,8 +292,8 @@ class _SparseColumnIntegerized(_SparseColumn): combiner="sum", dtype=dtypes.int64): if not dtype.is_integer: - raise ValueError("dtype should be an integer. Given {}".format( - column_name)) + raise ValueError("dtype must be an integer. " + "dtype: {}, column_name: {}".format(dtype, column_name)) return super(_SparseColumnIntegerized, cls).__new__(cls, column_name, @@ -505,8 +508,8 @@ class _WeightedSparseColumn(_FeatureColumn, collections.namedtuple( input_tensor, weight_collections=None, trainable=True): - raise ValueError("Column {} is not supported in DNN. " - "Please use embedding_column.".format(self)) + raise ValueError("WeightedSparseColumn is not supported in DNN. " + "Please use embedding_column. column: {}".format(self)) def to_weighted_sum(self, input_tensor, @@ -522,7 +525,7 @@ class _WeightedSparseColumn(_FeatureColumn, collections.namedtuple( initializer=init_ops.zeros_initializer, combiner=self.sparse_id_column.combiner, trainable=trainable, - name=self.name + "_weights") + name=self.name) def weighted_sparse_column(sparse_id_column, @@ -568,7 +571,8 @@ def weighted_sparse_column(sparse_id_column, class _EmbeddingColumn(_FeatureColumn, collections.namedtuple( "_EmbeddingColumn", - ["sparse_id_column", "dimension", "combiner", "initializer"])): + ["sparse_id_column", "dimension", "combiner", "initializer", + "ckpt_to_load_from", "tensor_name_in_ckpt"])): """Represents an embedding column. 
Args: @@ -586,15 +590,33 @@ class _EmbeddingColumn(_FeatureColumn, collections.namedtuple( variable initialization. If not specified, defaults to `tf.truncated_normal_initializer` with mean 0.0 and standard deviation 1/sqrt(sparse_id_column.length). + ckpt_to_load_from: (Optional). String representing checkpoint name/pattern + to restore the column weights. Required if `tensor_name_in_ckpt` is not + None. + tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided + checkpoint from which to restore the column weights. Required if + `ckpt_to_load_from` is not None. + + Raises: + ValueError: if `initializer` is specified and is not callable. Also, + if only one of `ckpt_to_load_from` and `tensor_name_in_ckpt` is specified. """ def __new__(cls, sparse_id_column, dimension, combiner="mean", - initializer=None): + initializer=None, + ckpt_to_load_from=None, + tensor_name_in_ckpt=None): if initializer is not None and not callable(initializer): - raise ValueError("initializer must be callable if specified.") + raise ValueError("initializer must be callable if specified. " + "Embedding of column_name: {}".format( + sparse_id_column.name)) + + if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None): + raise ValueError("Must specify both `ckpt_to_load_from` and " + "`tensor_name_in_ckpt` or none of them.") if initializer is None: stddev = 1 / math.sqrt(sparse_id_column.length) # TODO(b/25671353): Better initial value? 
@@ -602,7 +624,8 @@ class _EmbeddingColumn(_FeatureColumn, collections.namedtuple( stddev=stddev) return super(_EmbeddingColumn, cls).__new__(cls, sparse_id_column, dimension, combiner, - initializer) + initializer, ckpt_to_load_from, + tensor_name_in_ckpt) @property def name(self): @@ -645,7 +668,7 @@ class _EmbeddingColumn(_FeatureColumn, collections.namedtuple( input_tensor, weight_collections=None, trainable=True): - output, _ = _create_embedding_lookup( + output, embedding_weights = _create_embedding_lookup( input_tensor=self.sparse_id_column.id_tensor(input_tensor), weight_tensor=self.sparse_id_column.weight_tensor(input_tensor), vocab_size=self.length, @@ -654,7 +677,14 @@ class _EmbeddingColumn(_FeatureColumn, collections.namedtuple( initializer=self.initializer, combiner=self.combiner, trainable=trainable, - name=self.name + "_weights") + name=self.name) + if self.ckpt_to_load_from is not None: + weights_to_restore = embedding_weights + if len(embedding_weights) == 1: + weights_to_restore = embedding_weights[0] + checkpoint_utils.init_from_checkpoint( + self.ckpt_to_load_from, + {self.tensor_name_in_ckpt: weights_to_restore}) return output # pylint: disable=unused-argument @@ -663,19 +693,22 @@ class _EmbeddingColumn(_FeatureColumn, collections.namedtuple( num_outputs=1, weight_collections=None, trainable=True): - raise ValueError("Column {} is not supported in linear models. " - "Please use sparse_column.".format(self)) + raise ValueError("EmbeddingColumn is not supported in linear models. " + "Please use sparse_column. column: {}".format(self)) def embedding_column(sparse_id_column, dimension, combiner="mean", - initializer=None): + initializer=None, + ckpt_to_load_from=None, + tensor_name_in_ckpt=None): """Creates an _EmbeddingColumn. Args: sparse_id_column: A _SparseColumn which is created by `sparse_column_with_*` - functions. Note that `combiner` defined in `sparse_id_column` is ignored. + or crossed_column functions. 
Note that `combiner` defined in + `sparse_id_column` is ignored. dimension: An integer specifying dimension of the embedding. combiner: A string specifying how to reduce if there are multiple entries in a single row. Currently "mean", "sqrtn" and "sum" are supported. Each @@ -688,11 +721,18 @@ def embedding_column(sparse_id_column, variable initialization. If not specified, defaults to `tf.truncated_normal_initializer` with mean 0.0 and standard deviation 1/sqrt(sparse_id_column.length). + ckpt_to_load_from: (Optional). String representing checkpoint name/pattern + to restore the column weights. Required if `tensor_name_in_ckpt` is not + None. + tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided + checkpoint from which to restore the column weights. Required if + `ckpt_to_load_from` is not None. Returns: An _EmbeddingColumn. """ - return _EmbeddingColumn(sparse_id_column, dimension, combiner, initializer) + return _EmbeddingColumn(sparse_id_column, dimension, combiner, initializer, + ckpt_to_load_from, tensor_name_in_ckpt) class _HashedEmbeddingColumn(collections.namedtuple( @@ -707,7 +747,8 @@ class _HashedEmbeddingColumn(collections.namedtuple( combiner="mean", initializer=None): if initializer is not None and not callable(initializer): - raise ValueError("initializer must be callable if specified.") + raise ValueError("initializer must be callable if specified. " + "column_name: {}".format(column_name)) if initializer is None: stddev = 0.1 # TODO(b/25671353): Better initial value? 
@@ -733,7 +774,7 @@ class _HashedEmbeddingColumn(collections.namedtuple( weight_collections=None, trainable=True): embeddings = _create_embeddings( - name=self.name + "_weights", + name=self.name, shape=[self.size], initializer=self.initializer, dtype=dtypes.float32, @@ -778,10 +819,14 @@ def hashed_embedding_column(column_name, """ if (dimension < 1) or (size < 1): - raise ValueError("Dimension and size must be greater than 0.") + raise ValueError("Dimension and size must be greater than 0. " + "dimension: {}, size: {}, column_name: {}".format( + dimension, size, column_name)) if combiner not in ("mean", "sqrtn", "sum"): - raise ValueError("Combiner must be one of 'mean', 'sqrtn' or 'sum'.") + raise ValueError("Combiner must be one of 'mean', 'sqrtn' or 'sum'. " + "combiner: {}, column_name: {}".format( + combiner, column_name)) return _HashedEmbeddingColumn(column_name, size, dimension, combiner, initializer) @@ -892,14 +937,18 @@ def real_valued_column(column_name, """ if not isinstance(dimension, int): - raise TypeError("dimension must be an integer") + raise TypeError("dimension must be an integer. " + "dimension: {}, column_name: {}".format(dimension, + column_name)) if dimension < 1: - raise ValueError("dimension must be greater than 0") + raise ValueError("dimension must be greater than 0. " + "dimension: {}, column_name: {}".format(dimension, + column_name)) if not (dtype.is_integer or dtype.is_floating): - raise ValueError("dtype is not convertible to tf.float32. Given {}".format( - dtype)) + raise ValueError("dtype must be convertible to float. " + "dtype: {}, column_name: {}".format(dtype, column_name)) if default_value is None: return _RealValuedColumn(column_name, dimension, default_value, dtype) @@ -920,9 +969,10 @@ def real_valued_column(column_name, if isinstance(default_value, list): if len(default_value) != dimension: - raise ValueError("The length of default_value is not equal to the " - "value of dimension. 
default_value is {}.".format( - default_value)) + raise ValueError( + "The length of default_value must be equal to dimension. " + "default_value: {}, dimension: {}, column_name: {}".format( + default_value, dimension, column_name)) # Check if the values in the list are all integers or are convertible to # floats. is_list_all_int = True @@ -943,8 +993,9 @@ def real_valued_column(column_name, default_value = [float(v) for v in default_value] return _RealValuedColumn(column_name, dimension, default_value, dtype) - raise TypeError("default_value is not compatible with dtype. " - "default_value is {}.".format(default_value)) + raise TypeError("default_value must be compatible with dtype. " + "default_value: {}, dtype: {}, column_name: {}".format( + default_value, dtype, column_name)) class _BucketizedColumn(_FeatureColumn, collections.namedtuple( @@ -971,10 +1022,12 @@ class _BucketizedColumn(_FeatureColumn, collections.namedtuple( def __new__(cls, source_column, boundaries): if not isinstance(source_column, _RealValuedColumn): raise TypeError( - "source_column should be an instance of _RealValuedColumn.") + "source_column must be an instance of _RealValuedColumn. " + "source_column: {}".format(source_column)) if not isinstance(boundaries, list) or not boundaries: - raise ValueError("boundaries must be a list and it should not be empty.") + raise ValueError("boundaries must be a non-empty list. " + "boundaries: {}".format(boundaries)) # We allow bucket boundaries to be monotonically increasing # (ie a[i+1] >= a[i]). When two bucket boundaries are the same, we @@ -986,7 +1039,8 @@ class _BucketizedColumn(_FeatureColumn, collections.namedtuple( elif boundaries[i] < boundaries[i + 1]: sanitized_boundaries.append(boundaries[i]) else: - raise ValueError("boundaries must be a sorted list") + raise ValueError("boundaries must be a sorted list. 
" + "boundaries: {}".format(boundaries)) sanitized_boundaries.append(boundaries[len(boundaries) - 1]) return super(_BucketizedColumn, cls).__new__(cls, source_column, @@ -1067,7 +1121,7 @@ class _BucketizedColumn(_FeatureColumn, collections.namedtuple( initializer=init_ops.zeros_initializer, combiner="sum", trainable=trainable, - name=self.name + "_weights") + name=self.name) def bucketized_column(source_column, boundaries): @@ -1087,7 +1141,8 @@ def bucketized_column(source_column, boundaries): class _CrossedColumn(_FeatureColumn, collections.namedtuple( - "_CrossedColumn", ["columns", "hash_bucket_size", "combiner"])): + "_CrossedColumn", ["columns", "hash_bucket_size", "combiner", + "ckpt_to_load_from", "tensor_name_in_ckpt"])): """Represents a cross transformation also known as composition or union. Instances of this class are immutable. It crosses given `columns`. Crossed @@ -1124,13 +1179,19 @@ class _CrossedColumn(_FeatureColumn, collections.namedtuple( * "mean": do l1 normalization * "sqrtn": do l2 normalization For more information: `tf.embedding_lookup_sparse`. + ckpt_to_load_from: (Optional). String representing checkpoint name/pattern + to restore the column weights. Required if `tensor_name_in_ckpt` is not + None. + tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided + checkpoint from which to restore the column weights. Required if + `ckpt_to_load_from` is not None. Raises: TypeError: if all items in columns are not an instance of _SparseColumn, _CrossedColumn, or _BucketizedColumn or hash_bucket_size is not an int. - ValueError: if hash_bucket_size is not > 1 or - len(columns) is not > 1. + ValueError: if hash_bucket_size is not > 1 or len(columns) is not > 1. Also, + if only one of `ckpt_to_load_from` and `tensor_name_in_ckpt` is specified. 
""" @staticmethod @@ -1138,26 +1199,36 @@ class _CrossedColumn(_FeatureColumn, collections.namedtuple( return isinstance(column, (_SparseColumn, _CrossedColumn, _BucketizedColumn)) - def __new__(cls, columns, hash_bucket_size, combiner="sum"): + def __new__(cls, columns, hash_bucket_size, combiner="sum", + ckpt_to_load_from=None, tensor_name_in_ckpt=None): for column in columns: if not _CrossedColumn._is_crossable(column): - raise TypeError("columns should be a set of " - "_SparseColumn, _CrossedColumn, or _BucketizedColumn. " - "Column is {}".format(column)) + raise TypeError("columns must be a set of _SparseColumn, " + "_CrossedColumn, or _BucketizedColumn instances. " + "column: {}".format(column)) if len(columns) < 2: - raise ValueError("columns should contain at least 2 elements.") + raise ValueError("columns must contain at least 2 elements. " + "columns: {}".format(columns)) if not isinstance(hash_bucket_size, int): - raise TypeError("hash_bucket_size should be an int.") + raise TypeError("hash_bucket_size must be an int. " + "hash_bucket_size: {}".format(hash_bucket_size)) if hash_bucket_size < 2: - raise ValueError("hash_bucket_size should be at least 2.") + raise ValueError("hash_bucket_size must be at least 2. 
" + "hash_bucket_size: {}".format(hash_bucket_size)) + + if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None): + raise ValueError("Must specify both `ckpt_to_load_from` and " + "`tensor_name_in_ckpt` or none of them.") sorted_columns = sorted([column for column in columns], key=lambda column: column.name) return super(_CrossedColumn, cls).__new__(cls, tuple(sorted_columns), - hash_bucket_size, combiner) + hash_bucket_size, combiner, + ckpt_to_load_from, + tensor_name_in_ckpt) @property def name(self): @@ -1181,6 +1252,15 @@ class _CrossedColumn(_FeatureColumn, collections.namedtuple( """Returns a string which will be used as a key when we do sorting.""" return "{}".format(self) + def id_tensor(self, input_tensor): + """Returns the id tensor from the given transformed input_tensor.""" + return input_tensor + + # pylint: disable=unused-argument + def weight_tensor(self, input_tensor): + """Returns the weight tensor from the given transformed input_tensor.""" + return None + def insert_transformed_feature(self, columns_to_tensors): """Handles cross transformation.""" @@ -1215,15 +1295,15 @@ class _CrossedColumn(_FeatureColumn, collections.namedtuple( input_tensor, weight_collections=None, trainable=True): - raise ValueError("Column {} is not supported in DNN. " - "Please use embedding_column.".format(self)) + raise ValueError("CrossedColumn is not supported in DNN. " + "Please use embedding_column. 
column: {}".format(self)) def to_weighted_sum(self, input_tensor, num_outputs=1, weight_collections=None, trainable=True): - return _create_embedding_lookup( + output, embedding_weights = _create_embedding_lookup( input_tensor=input_tensor, weight_tensor=None, vocab_size=self.length, @@ -1232,10 +1312,20 @@ class _CrossedColumn(_FeatureColumn, collections.namedtuple( initializer=init_ops.zeros_initializer, combiner=self.combiner, trainable=trainable, - name=self.name + "_weights") + name=self.name) + if self.ckpt_to_load_from is not None: + weights_to_restore = embedding_weights + if len(embedding_weights) == 1: + weights_to_restore = embedding_weights[0] + checkpoint_utils.init_from_checkpoint( + self.ckpt_to_load_from, + {self.tensor_name_in_ckpt: weights_to_restore}) + return output, embedding_weights -def crossed_column(columns, hash_bucket_size, combiner="sum"): +def crossed_column(columns, hash_bucket_size, combiner="sum", + ckpt_to_load_from=None, + tensor_name_in_ckpt=None): """Creates a _CrossedColumn. Args: @@ -1243,6 +1333,12 @@ def crossed_column(columns, hash_bucket_size, combiner="sum"): _SparseColumn, _CrossedColumn, or _BucketizedColumn. hash_bucket_size: An int that is > 1. The number of buckets. combiner: A combiner string, supports sum, mean, sqrtn. + ckpt_to_load_from: (Optional). String representing checkpoint name/pattern + to restore the column weights. Required if `tensor_name_in_ckpt` is not + None. + tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided + checkpoint from which to restore the column weights. Required if + `ckpt_to_load_from` is not None. Returns: A _CrossedColumn. @@ -1254,12 +1350,14 @@ def crossed_column(columns, hash_bucket_size, combiner="sum"): ValueError: if hash_bucket_size is not > 1 or len(columns) is not > 1. 
""" - return _CrossedColumn(columns, hash_bucket_size, combiner=combiner) + return _CrossedColumn(columns, hash_bucket_size, combiner=combiner, + ckpt_to_load_from=ckpt_to_load_from, + tensor_name_in_ckpt=tensor_name_in_ckpt) class DataFrameColumn(_FeatureColumn, collections.namedtuple("DataFrameColumn", - ["name", "series"])): + ["column_name", "series"])): """Represents a feature column produced from a `DataFrame`. Instances of this class are immutable. A `DataFrame` column may be dense or @@ -1267,13 +1365,17 @@ class DataFrameColumn(_FeatureColumn, batch_size. Args: - name: a name for this column + column_name: a name for this column series: a `Series` to be wrapped, which has already had its base features substituted with `PredefinedSeries`. """ - def __new__(cls, name, series): - return super(DataFrameColumn, cls).__new__(cls, name, series) + def __new__(cls, column_name, series): + return super(DataFrameColumn, cls).__new__(cls, column_name, series) + + @property + def name(self): + return self.column_name @property def config(self): @@ -1301,7 +1403,17 @@ class DataFrameColumn(_FeatureColumn, input_tensor, weight_collections=None, trainable=True): - return input_tensor + # DataFrame typically provides Tensors of shape [batch_size], + # but Estimator requires shape [batch_size, 1] + dims = input_tensor.get_shape().ndims + if dims == 0: + raise ValueError( + "Can't build input layer from tensor of shape (): {}".format( + self.column_name)) + elif dims == 1: + return array_ops.expand_dims(input_tensor, 1) + else: + return input_tensor # TODO(soergel): This mirrors RealValuedColumn for now, but should become # better abstracted with less code duplication when we add other kinds. @@ -1469,7 +1581,7 @@ def _create_embeddings(name, shape, dtype, initializer, trainable, with just one variable. Args: - name: A string specifying the name of the embedding variable. + name: A string. The name of the embedding variable will be name + _weights. 
shape: shape of the embeddding. Note this is not the shape of partitioned variables. dtype: type of the embedding. Also the shape of each partitioned variable. @@ -1531,7 +1643,7 @@ def _create_embedding_lookup(input_tensor, weight_tensor, vocab_size, dimension, A Tensor with shape [batch_size, dimension] and embedding Variable. """ - embeddings = _create_embeddings(name=name, + embeddings = _create_embeddings(name=name + "_weights", shape=[vocab_size, dimension], dtype=dtypes.float32, initializer=initializer, @@ -1543,4 +1655,4 @@ def _create_embedding_lookup(input_tensor, weight_tensor, vocab_size, dimension, sparse_weights=weight_tensor, default_id=0, combiner=combiner, - name=name), embeddings + name=name + "_weights"), embeddings diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py index 207f86dc8be..7041aaad3e6 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py +++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py @@ -393,6 +393,24 @@ class InputLayerTest(tf.test.TestCase): tf.initialize_all_tables().run() self.assertAllEqual(output.eval().shape, [2, 10]) + def testEmbeddingColumnWitCrossedColumn(self): + a = tf.contrib.layers.sparse_column_with_hash_bucket("aaa", + hash_bucket_size=100) + b = tf.contrib.layers.sparse_column_with_hash_bucket("bbb", + hash_bucket_size=100) + crossed = tf.contrib.layers.crossed_column( + set([a, b]), hash_bucket_size=10000) + wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], + indices=[[0, 0], [1, 0], [1, 1]], + shape=[2, 2]) + features = {"aaa": wire_tensor, "bbb": wire_tensor} + embeded_sparse = tf.contrib.layers.embedding_column(crossed, 10) + output = tf.contrib.layers.input_from_feature_columns(features, + [embeded_sparse]) + with self.test_session(): + tf.initialize_all_variables().run() + self.assertAllEqual(output.eval().shape, [2, 10]) + def 
testSparseColumn(self): hashed_sparse = tf.contrib.layers.sparse_column_with_hash_bucket("wire", 10) wire_tensor = tf.SparseTensor(values=["omar", "stringer", "marlo"], diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py index 187fadfce6f..6f1393da4d4 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column_test.py +++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py @@ -19,6 +19,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import os + import tensorflow as tf @@ -58,14 +60,17 @@ class FeatureColumnTest(tf.test.TestCase): self.assertEqual(b.dimension, 10) self.assertTrue(b.default_value is None) - # dimension is an integer - with self.assertRaises(TypeError): + with self.assertRaisesRegexp(TypeError, "dimension must be an integer"): tf.contrib.layers.real_valued_column("d3", dimension=1.0) - # dimension is a positive integer - with self.assertRaises(ValueError): + with self.assertRaisesRegexp(ValueError, + "dimension must be greater than 0"): tf.contrib.layers.real_valued_column("d3", dimension=0) + with self.assertRaisesRegexp(ValueError, + "dtype must be convertible to float"): + tf.contrib.layers.real_valued_column("d3", dtype=tf.string) + # default_value is an integer. c1 = tf.contrib.layers.real_valued_column("c1", default_value=2) self.assertListEqual(list(c1.default_value), [2.]) @@ -90,15 +95,18 @@ class FeatureColumnTest(tf.test.TestCase): dimension=4, default_value=2.) self.assertListEqual(list(d2.default_value), [2., 2., 2., 2.]) - with self.assertRaises(TypeError): + with self.assertRaisesRegexp(TypeError, + "default_value must be compatible with dtype"): tf.contrib.layers.real_valued_column("d3", default_value=2., dtype=tf.int32) - # default_value is neither interger nor float. - with self.assertRaises(TypeError): + # default_value is neither integer nor float. 
+ with self.assertRaisesRegexp( + TypeError, "default_value must be compatible with dtype"): tf.contrib.layers.real_valued_column("e1", default_value="string") - with self.assertRaises(TypeError): + with self.assertRaisesRegexp( + TypeError, "default_value must be compatible with dtype"): tf.contrib.layers.real_valued_column("e1", dimension=3, default_value=[1, 3., "string"]) @@ -123,11 +131,13 @@ class FeatureColumnTest(tf.test.TestCase): dimension=3, default_value=[2., 2, 2]) self.assertListEqual(list(g2.default_value), [2., 2., 2.]) - with self.assertRaises(TypeError): + with self.assertRaisesRegexp( + TypeError, "default_value must be compatible with dtype"): tf.contrib.layers.real_valued_column("g3", default_value=[2.], dtype=tf.int32) - with self.assertRaises(ValueError): + with self.assertRaisesRegexp( + ValueError, "The length of default_value must be equal to dimension"): tf.contrib.layers.real_valued_column("g4", dimension=3, default_value=[2.]) @@ -138,11 +148,19 @@ class FeatureColumnTest(tf.test.TestCase): self.assertEqual(a.name, "aaa_BUCKETIZED") def testBucketizedColumnRequiresRealValuedColumn(self): - with self.assertRaises(TypeError): + with self.assertRaisesRegexp( + TypeError, "source_column must be an instance of _RealValuedColumn"): tf.contrib.layers.bucketized_column("bbb", [0]) + with self.assertRaisesRegexp( + TypeError, "source_column must be an instance of _RealValuedColumn"): + tf.contrib.layers.bucketized_column( + tf.contrib.layers.sparse_column_with_integerized_feature( + column_name="bbb", bucket_size=10), + [0]) def testBucketizedColumnRequiresSortedBuckets(self): - with self.assertRaises(ValueError): + with self.assertRaisesRegexp( + ValueError, "boundaries must be a sorted list"): tf.contrib.layers.bucketized_column( tf.contrib.layers.real_valued_column("ccc"), [5, 0, 4]) @@ -171,7 +189,10 @@ class FeatureColumnTest(tf.test.TestCase): def testCrossedColumnNotSupportRealValuedColumn(self): b = 
tf.contrib.layers.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100) - with self.assertRaises(TypeError): + with self.assertRaisesRegexp( + TypeError, + "columns must be a set of _SparseColumn, _CrossedColumn, " + "or _BucketizedColumn instances"): tf.contrib.layers.crossed_column( set([b, tf.contrib.layers.real_valued_column("real")]), hash_bucket_size=10000) @@ -192,7 +213,8 @@ class FeatureColumnTest(tf.test.TestCase): "weights": tf.VarLenFeature(tf.int32)}, weighted_ids.config) - with self.assertRaises(ValueError): + with self.assertRaisesRegexp(ValueError, + "dtype is not convertible to float"): weighted_ids = tf.contrib.layers.weighted_sparse_column(ids, "weights", dtype=tf.string) @@ -209,7 +231,8 @@ class FeatureColumnTest(tf.test.TestCase): [1], dtype=tf.int32)}, rvc.config) - with self.assertRaises(ValueError): + with self.assertRaisesRegexp(ValueError, + "dtype must be convertible to float"): tf.contrib.layers.real_valued_column("rvc", dtype=tf.string) def testSparseColumnDtypes(self): @@ -220,7 +243,8 @@ class FeatureColumnTest(tf.test.TestCase): "sc", 10, dtype=tf.int32) self.assertDictEqual({"sc": tf.VarLenFeature(dtype=tf.int32)}, sc.config) - with self.assertRaises(ValueError): + with self.assertRaisesRegexp(ValueError, + "dtype must be an integer"): tf.contrib.layers.sparse_column_with_integerized_feature("sc", 10, dtype=tf.float32) @@ -323,6 +347,107 @@ class FeatureColumnTest(tf.test.TestCase): self.assertEqual(tf.float32, placeholder.dtype) self.assertEqual([None, 1], placeholder.get_shape().as_list()) + def testInitEmbeddingColumnWeightsFromCkpt(self): + sparse_col = tf.contrib.layers.sparse_column_with_hash_bucket( + column_name="object_in_image", + hash_bucket_size=4) + # Create _EmbeddingColumn which randomly initializes embedding of size + # [4, 16]. + embedding_col = tf.contrib.layers.embedding_column(sparse_col, dimension=16) + + # Creating a SparseTensor which has all the ids possible for the given + # vocab. 
+ input_tensor = tf.SparseTensor(indices=[[0, 0], [1, 1], [2, 2], [3, 3]], + values=[0, 1, 2, 3], + shape=[4, 4]) + + # Invoking 'embedding_column.to_dnn_input_layer' will create the embedding + # variable. Creating under scope 'run_1' so as to prevent name conflicts + # when creating embedding variable for 'embedding_column_pretrained'. + with tf.variable_scope("run_1"): + # This will return a [4, 16] tensor which is same as embedding variable. + embeddings = embedding_col.to_dnn_input_layer(input_tensor) + + save = tf.train.Saver() + checkpoint_path = os.path.join(self.get_temp_dir(), "model.ckpt") + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + saved_embedding = embeddings.eval() + save.save(sess, checkpoint_path) + + embedding_col_initialized = tf.contrib.layers.embedding_column( + sparse_id_column=sparse_col, + dimension=16, + ckpt_to_load_from=checkpoint_path, + tensor_name_in_ckpt="run_1/object_in_image_embedding_weights") + + with tf.variable_scope("run_2"): + # This will initialize the embedding from provided checkpoint and return a + # [4, 16] tensor which is same as embedding variable. Since we didn't + # modify embeddings, this should be same as 'saved_embedding'. 
+ pretrained_embeddings = embedding_col_initialized.to_dnn_input_layer( + input_tensor) + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + loaded_embedding = pretrained_embeddings.eval() + + self.assertAllClose(saved_embedding, loaded_embedding) + + def testInitCrossedColumnWeightsFromCkpt(self): + sparse_col_1 = tf.contrib.layers.sparse_column_with_hash_bucket( + column_name="col_1", hash_bucket_size=4) + sparse_col_2 = tf.contrib.layers.sparse_column_with_hash_bucket( + column_name="col_2", hash_bucket_size=4) + + crossed_col = tf.contrib.layers.crossed_column( + columns=[sparse_col_1, sparse_col_2], + hash_bucket_size=4) + + input_tensor = tf.SparseTensor(indices=[[0, 0], [1, 1], [2, 2], [3, 3]], + values=[0, 1, 2, 3], + shape=[4, 4]) + + # Invoking 'crossed_col.to_weighted_sum' will create the crossed column + # weights variable. + with tf.variable_scope("run_1"): + # Returns looked up column weights which is same as crossed column weights + # as well as actual references to weights variables. + col_weights, weights = crossed_col.to_weighted_sum(input_tensor) + # Update the weights since default initializer initializes all weights to + # 0.0. + for weight in weights: + assign_op = tf.assign(weight, weight + 0.5) + + save = tf.train.Saver() + checkpoint_path = os.path.join(self.get_temp_dir(), "model.ckpt") + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + sess.run(assign_op) + saved_col_weights = col_weights.eval() + save.save(sess, checkpoint_path) + + crossed_col_initialized = tf.contrib.layers.crossed_column( + columns=[sparse_col_1, sparse_col_2], + hash_bucket_size=4, + ckpt_to_load_from=checkpoint_path, + tensor_name_in_ckpt="run_1/col_1_X_col_2_weights") + + with tf.variable_scope("run_2"): + # This will initialize the crossed column weights from provided checkpoint + # and return a [4, 1] tensor which is same as weights variable. 
Since we + # won't modify weights, this should be same as 'saved_col_weights'. + col_weights_from_ckpt, _ = crossed_col_initialized.to_weighted_sum( + input_tensor) + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + loaded_col_weights = col_weights_from_ckpt.eval() + + self.assertAllClose(saved_col_weights, loaded_col_weights) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/contrib/layers/python/layers/initializers.py b/tensorflow/contrib/layers/python/layers/initializers.py index 03665a4951e..1786b71dcf7 100644 --- a/tensorflow/contrib/layers/python/layers/initializers.py +++ b/tensorflow/contrib/layers/python/layers/initializers.py @@ -102,12 +102,13 @@ def variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False, TypeError: if `mode` is not in ['FAN_IN', 'FAN_OUT', 'FAN_AVG']. """ if not dtype.is_floating: - raise TypeError('Cannot create initializer for non-floating point ' - 'type.') + raise TypeError('Cannot create initializer for non-floating point type.') if mode not in ['FAN_IN', 'FAN_OUT', 'FAN_AVG']: raise TypeError('Unknow mode %s [FAN_IN, FAN_OUT, FAN_AVG]', mode) def _initializer(shape, dtype=dtype): """Initializer function.""" + if not dtype.is_floating: + raise TypeError('Cannot create initializer for non-floating point type.') # Estimating fan_in and fan_out is not possible to do perfectly, but we try. # This is the right thing for matrix multiply and convolutions. 
fan_in = float(shape[-2]) diff --git a/tensorflow/contrib/layers/python/layers/initializers_test.py b/tensorflow/contrib/layers/python/layers/initializers_test.py index bacf16c1ad8..d619dd8ee09 100644 --- a/tensorflow/contrib/layers/python/layers/initializers_test.py +++ b/tensorflow/contrib/layers/python/layers/initializers_test.py @@ -64,6 +64,11 @@ class VarianceScalingInitializerTest(tf.test.TestCase): TypeError, 'Cannot create initializer for non-floating point type.'): tf.contrib.layers.variance_scaling_initializer(dtype=tf.int32) + initializer = tf.contrib.layers.variance_scaling_initializer() + with self.assertRaisesRegexp( + TypeError, + 'Cannot create initializer for non-floating point type.'): + initializer([], dtype=tf.int32) def _test_variance(self, initializer, shape, variance, factor, mode, uniform): with tf.Graph().as_default() as g: diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index e4a25fa1138..1f63b8a0151 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -75,25 +75,24 @@ def avg_pool2d(inputs, padding='VALID', outputs_collections=None, scope=None): - """Adds a Avg Pooling op. + """Adds a 2D average pooling op. - It is assumed by the wrapper that the pooling is only done per image and not - in depth or batch. + It is assumed that the pooling is done per image but not in batch or channels. Args: - inputs: a tensor of size [batch_size, height, width, depth]. - kernel_size: a list of length 2: [kernel_height, kernel_width] of the + inputs: A `Tensor` of size [batch_size, height, width, channels]. + kernel_size: A list of length 2: [kernel_height, kernel_width] of the pooling kernel over which the op is computed. Can be an int if both values are the same. - stride: a list of length 2: [stride_height, stride_width]. - Can be an int if both strides are the same. 
Note that presently + stride: A list of length 2: [stride_height, stride_width]. + Can be an int if both strides are the same. Note that presently both strides must have the same value. - padding: the padding method, either 'VALID' or 'SAME'. - outputs_collections: collection to add the outputs. + padding: The padding method, either 'VALID' or 'SAME'. + outputs_collections: The collections to which the outputs are added. scope: Optional scope for op_scope. Returns: - a tensor representing the results of the pooling operation. + A `Tensor` representing the results of the pooling operation. """ with ops.op_scope([inputs], scope, 'AvgPool2D') as sc: inputs = ops.convert_to_tensor(inputs) @@ -843,27 +842,27 @@ def max_pool2d(inputs, padding='VALID', outputs_collections=None, scope=None): - """Adds a Max Pooling op. + """Adds a 2D Max Pooling op. - It is assumed by the wrapper that the pooling is only done per image and not - in depth or batch. + It is assumed that the pooling is done per image but not in batch or channels. Args: - inputs: a tensor of size [batch_size, height, width, depth]. - kernel_size: a list of length 2: [kernel_height, kernel_width] of the + inputs: A `Tensor` of size [batch_size, height, width, channels]. + kernel_size: A list of length 2: [kernel_height, kernel_width] of the pooling kernel over which the op is computed. Can be an int if both values are the same. - stride: a list of length 2: [stride_height, stride_width]. - Can be an int if both strides are the same. Note that presently + stride: A list of length 2: [stride_height, stride_width]. + Can be an int if both strides are the same. Note that presently both strides must have the same value. - padding: the padding method, either 'VALID' or 'SAME'. - outputs_collections: collection to add the outputs. + padding: The padding method, either 'VALID' or 'SAME'. + outputs_collections: The collections to which the outputs are added. scope: Optional scope for op_scope. 
Returns: - a tensor representing the results of the pooling operation. + A `Tensor` representing the results of the pooling operation. + Raises: - ValueError: if 'kernel_size' is not a 2-D list + ValueError: If 'kernel_size' is not a 2-D list """ with ops.op_scope([inputs], scope, 'MaxPool2D') as sc: inputs = ops.convert_to_tensor(inputs) @@ -1037,6 +1036,7 @@ def separable_convolution2d( depthwise_weights = variables.model_variable( 'depthwise_weights', shape=depthwise_shape, + dtype=dtype, initializer=weights_initializer, regularizer=weights_regularizer, trainable=trainable, @@ -1049,6 +1049,7 @@ def separable_convolution2d( pointwise_weights = variables.model_variable( 'pointwise_weights', shape=pointwise_shape, + dtype=dtype, initializer=weights_initializer, regularizer=weights_regularizer, trainable=trainable, diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index dc962ac76eb..4d849894051 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -30,59 +30,52 @@ class AvgPool2DTest(tf.test.TestCase): def testCreateAvgPool(self): height, width = 3, 3 - with self.test_session(): - images = np.random.uniform(size=(5, height, width, 3)) - output = tf.contrib.layers.avg_pool2d(images, [3, 3]) - self.assertEquals(output.op.name, 'AvgPool2D/AvgPool') - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) + images = np.random.uniform(size=(5, height, width, 3)) + output = tf.contrib.layers.avg_pool2d(images, [3, 3]) + self.assertEquals(output.op.name, 'AvgPool2D/AvgPool') + self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) def testCollectOutputs(self): height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = tf.contrib.layers.avg_pool2d(images, [3, 3], - outputs_collections='outputs') - c_output = tf.get_collection('outputs')[0] - 
self.assertEquals(c_output.name, 'AvgPool2D') - self.assertEquals(c_output.outputs, output) + images = tf.random_uniform((5, height, width, 3), seed=1) + output = tf.contrib.layers.avg_pool2d(images, [3, 3], + outputs_collections='outputs') + output_collection = tf.get_collection('outputs')[0] + self.assertEquals(output_collection.name, 'AvgPool2D') + self.assertEquals(output_collection.outputs, output) def testCreateSquareAvgPool(self): height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = tf.contrib.layers.avg_pool2d(images, 3) - self.assertEquals(output.op.name, 'AvgPool2D/AvgPool') - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) + images = tf.random_uniform((5, height, width, 3), seed=1) + output = tf.contrib.layers.avg_pool2d(images, 3) + self.assertEquals(output.op.name, 'AvgPool2D/AvgPool') + self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) def testCreateAvgPoolWithScope(self): height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = tf.contrib.layers.avg_pool2d(images, [3, 3], scope='pool1') - self.assertEquals(output.op.name, 'pool1/AvgPool') + images = tf.random_uniform((5, height, width, 3), seed=1) + output = tf.contrib.layers.avg_pool2d(images, [3, 3], scope='pool1') + self.assertEquals(output.op.name, 'pool1/AvgPool') - def testCreateAvgPoolSAME(self): + def testCreateAvgPoolWithSamePadding(self): height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = tf.contrib.layers.avg_pool2d(images, [3, 3], padding='SAME') - self.assertListEqual(output.get_shape().as_list(), [5, 2, 2, 3]) + images = tf.random_uniform((5, height, width, 3), seed=1) + output = tf.contrib.layers.avg_pool2d(images, [3, 3], padding='SAME') + self.assertListEqual(output.get_shape().as_list(), [5, 2, 2, 3]) - def testCreateAvgPoolStrideSAME(self): + def 
testCreateAvgPoolStrideWithSamePadding(self): height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = tf.contrib.layers.avg_pool2d(images, [3, 3], stride=1, - padding='SAME') - self.assertListEqual(output.get_shape().as_list(), [5, height, width, 3]) + images = tf.random_uniform((5, height, width, 3), seed=1) + output = tf.contrib.layers.avg_pool2d(images, [3, 3], stride=1, + padding='SAME') + self.assertListEqual(output.get_shape().as_list(), [5, height, width, 3]) def testGlobalAvgPool(self): height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = tf.contrib.layers.avg_pool2d(images, images.get_shape()[1:3], - stride=1) - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) + images = tf.random_uniform((5, height, width, 3), seed=1) + output = tf.contrib.layers.avg_pool2d(images, images.get_shape()[1:3], + stride=1) + self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) class BiasAddTest(tf.test.TestCase): @@ -825,7 +818,7 @@ class DropoutTest(tf.test.TestCase): with self.test_session(): images = np.random.uniform(size=(5, height, width, 3)) output = tf.contrib.layers.dropout(images) - self.assertEquals(output.op.name, 'Dropout/dropout/mul_1') + self.assertEquals(output.op.name, 'Dropout/dropout/mul') output.get_shape().assert_is_compatible_with( tf.convert_to_tensor(images).get_shape()) @@ -835,7 +828,7 @@ class DropoutTest(tf.test.TestCase): is_training = tf.constant(True) images = tf.random_uniform((5, height, width, 3), seed=1) output = tf.contrib.layers.dropout(images, is_training=is_training) - self.assertEquals(output.op.name, 'Dropout/dropout/mul_1') + self.assertEquals(output.op.name, 'Dropout/dropout/mul') output.get_shape().assert_is_compatible_with(images.get_shape()) def testCreateDropoutWithConstantFalse(self): @@ -1502,59 +1495,52 @@ class MaxPool2DTest(tf.test.TestCase): def 
testCreateMaxPool(self): height, width = 3, 3 - with self.test_session(): - images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32) - output = tf.contrib.layers.max_pool2d(images, [3, 3]) - self.assertEquals(output.op.name, 'MaxPool2D/MaxPool') - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) + images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32) + output = tf.contrib.layers.max_pool2d(images, [3, 3]) + self.assertEquals(output.op.name, 'MaxPool2D/MaxPool') + self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) def testCollectOutputs(self): height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = tf.contrib.layers.max_pool2d(images, [3, 3], - outputs_collections='outputs') - c_output = tf.get_collection('outputs')[0] - self.assertEquals(c_output.name, 'MaxPool2D') - self.assertEquals(c_output.outputs, output) + images = tf.random_uniform((5, height, width, 3), seed=1) + output = tf.contrib.layers.max_pool2d(images, [3, 3], + outputs_collections='outputs') + outputs_collection = tf.get_collection('outputs')[0] + self.assertEquals(outputs_collection.name, 'MaxPool2D') + self.assertEquals(outputs_collection.outputs, output) def testCreateSquareMaxPool(self): height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = tf.contrib.layers.max_pool2d(images, 3) - self.assertEquals(output.op.name, 'MaxPool2D/MaxPool') - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) + images = tf.random_uniform((5, height, width, 3), seed=1) + output = tf.contrib.layers.max_pool2d(images, 3) + self.assertEquals(output.op.name, 'MaxPool2D/MaxPool') + self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) def testCreateMaxPoolWithScope(self): height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = 
tf.contrib.layers.max_pool2d(images, [3, 3], scope='pool1') - self.assertEquals(output.op.name, 'pool1/MaxPool') + images = tf.random_uniform((5, height, width, 3), seed=1) + output = tf.contrib.layers.max_pool2d(images, [3, 3], scope='pool1') + self.assertEquals(output.op.name, 'pool1/MaxPool') - def testCreateMaxPoolSAME(self): + def testCreateMaxPoolWithSamePadding(self): height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = tf.contrib.layers.max_pool2d(images, [3, 3], padding='SAME') - self.assertListEqual(output.get_shape().as_list(), [5, 2, 2, 3]) + images = tf.random_uniform((5, height, width, 3), seed=1) + output = tf.contrib.layers.max_pool2d(images, [3, 3], padding='SAME') + self.assertListEqual(output.get_shape().as_list(), [5, 2, 2, 3]) - def testCreateMaxPoolStrideSAME(self): + def testCreateMaxPoolStrideWithSamePadding(self): height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = tf.contrib.layers.max_pool2d(images, [3, 3], stride=1, - padding='SAME') - self.assertListEqual(output.get_shape().as_list(), [5, height, width, 3]) + images = tf.random_uniform((5, height, width, 3), seed=1) + output = tf.contrib.layers.max_pool2d(images, [3, 3], stride=1, + padding='SAME') + self.assertListEqual(output.get_shape().as_list(), [5, height, width, 3]) def testGlobalMaxPool(self): height, width = 3, 3 - with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) - output = tf.contrib.layers.max_pool2d(images, images.get_shape()[1:3], - stride=1) - self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) + images = tf.random_uniform((5, height, width, 3), seed=1) + output = tf.contrib.layers.max_pool2d(images, images.get_shape()[1:3], + stride=1) + self.assertListEqual(output.get_shape().as_list(), [5, 1, 1, 3]) class OneHotEncodingTest(tf.test.TestCase): @@ -1618,10 +1604,28 @@ class 
RepeatTests(tf.test.TestCase): class SeparableConv2dTest(tf.test.TestCase): - def testCreateConv(self): + def testCreateConvInt32(self): height, width = 3, 3 with self.test_session(): - images = tf.random_uniform((5, height, width, 3), seed=1) + images = tf.random_uniform( + (5, height, width, 3), seed=1, dtype=tf.int32, maxval=12345) + with self.assertRaisesRegexp(TypeError, 'non-floating point type'): + tf.contrib.layers.separable_conv2d(images, 32, [3, 3], 2) + + def testCreateConvFloat32(self): + height, width = 3, 3 + with self.test_session(): + images = tf.random_uniform( + (5, height, width, 3), seed=1, dtype=tf.float32) + output = tf.contrib.layers.separable_conv2d(images, 32, [3, 3], 2) + self.assertEquals(output.op.name, 'SeparableConv2d/Relu') + self.assertListEqual(output.get_shape().as_list(), [5, height, width, 32]) + + def testCreateConvFloat64(self): + height, width = 3, 3 + with self.test_session(): + images = tf.random_uniform( + (5, height, width, 3), seed=1, dtype=tf.float64) output = tf.contrib.layers.separable_conv2d(images, 32, [3, 3], 2) self.assertEquals(output.op.name, 'SeparableConv2d/Relu') self.assertListEqual(output.get_shape().as_list(), [5, height, width, 32]) diff --git a/tensorflow/contrib/layers/python/layers/optimizers.py b/tensorflow/contrib/layers/python/layers/optimizers.py index 3da6c7c9711..e6235ca67b0 100644 --- a/tensorflow/contrib/layers/python/layers/optimizers.py +++ b/tensorflow/contrib/layers/python/layers/optimizers.py @@ -31,6 +31,7 @@ from tensorflow.python.ops import logging_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.ops import variables as vars_ +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import optimizer as optimizer_ from tensorflow.python.training import training as train @@ -43,6 +44,13 @@ OPTIMIZER_CLS_NAMES = { "SGD": train.GradientDescentOptimizer, } +OPTIMIZER_SUMMARIES = [ 
+ "learning_rate", + "loss", + "gradients", + "gradient_norm", +] + def optimize_loss(loss, global_step, @@ -51,11 +59,12 @@ def optimize_loss(loss, gradient_noise_scale=None, gradient_multipliers=None, clip_gradients=None, - moving_average_decay=0.9, + moving_average_decay=None, learning_rate_decay_fn=None, update_ops=None, variables=None, - name=None): + name=None, + summaries=None): """Given loss and parameters for optimizer, returns a training op. Args: @@ -75,8 +84,8 @@ def optimize_loss(loss, If present, gradients for specified variables will be multiplied by given constant. clip_gradients: float or `None`, clips gradients by this value. - moving_average_decay: float or None, takes into account previous loss - to make learning smoother due to outliers. + moving_average_decay: Deprecated. float or None, takes into account previous + loss to make learning smoother due to outliers. learning_rate_decay_fn: function, takes `learning_rate` and `global_step` `Tensor`s, returns `Tensor`. Can be used to implement any learning rate decay @@ -87,6 +96,9 @@ def optimize_loss(loss, variables: list of variables to optimize or `None` to use all trainable variables. name: The name for this operation is used to scope operations and summaries. + summaries: List of internal quantities to visualize on tensorboard. If not + set only the loss and the learning rate will be reported. The + complete list is in OPTIMIZER_SUMMARIES. Returns: Training op. @@ -96,8 +108,8 @@ def optimize_loss(loss, """ with vs.variable_op_scope([loss, global_step], name, "OptimizeLoss"): # Update ops take UPDATE_OPS collection if not provided. - update_ops = (set(update_ops or []) or - set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))) + if update_ops is None: + update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS)) # Make sure update ops are ran before computing loss. 
if update_ops: with ops.control_dependencies(update_ops): @@ -105,7 +117,10 @@ def optimize_loss(loss, loss = control_flow_ops.with_dependencies([barrier], loss) # Moving average of the loss with decay. + # TODO(b/30439864): moving_average_decay should be removed. if moving_average_decay is not None: + logging.warn("'moving_average_decay' is deprecated. Please use " + "tensorboard's builtin averaging instead.") # Generate moving averages of the loss. loss_averages = train.ExponentialMovingAverage(moving_average_decay, name="avg") @@ -125,9 +140,12 @@ def optimize_loss(loss, raise ValueError("Learning rate should be 0d Tensor or float. " "Got %s of type %s" % ( str(learning_rate), str(type(learning_rate)))) + if summaries is None: + summaries = ["loss", "learning_rate"] if learning_rate_decay_fn is not None: lr = learning_rate_decay_fn(lr, global_step) - logging_ops.scalar_summary("learning_rate", lr) + if "learning_rate" in summaries: + logging_ops.scalar_summary("learning_rate", lr) # Create optimizer, given specified parameters. if isinstance(optimizer, six.string_types): @@ -167,7 +185,8 @@ def optimize_loss(loss, gradients = _clip_gradients_by_norm(gradients, clip_gradients) # Add scalar summary for loss. - logging_ops.scalar_summary("loss", loss) + if "loss" in summaries: + logging_ops.scalar_summary("loss", loss) # Add histograms for variables, gradients and gradient norms. 
for gradient, variable in gradients: @@ -177,10 +196,12 @@ def optimize_loss(loss, grad_values = gradient if grad_values is not None: - logging_ops.histogram_summary(variable.name, variable) - logging_ops.histogram_summary(variable.name + "/gradients", grad_values) - logging_ops.histogram_summary(variable.name + "/gradient_norm", - clip_ops.global_norm([grad_values])) + if "gradients" in summaries: + logging_ops.histogram_summary(variable.name + "/gradients", + grad_values) + if "gradient_norm" in summaries: + logging_ops.histogram_summary(variable.name + "/gradient_norm", + clip_ops.global_norm([grad_values])) # Create gradient updates. grad_updates = opt.apply_gradients(gradients, diff --git a/tensorflow/contrib/layers/python/layers/optimizers_test.py b/tensorflow/contrib/layers/python/layers/optimizers_test.py index a38a8fe90bb..ad22490af78 100644 --- a/tensorflow/contrib/layers/python/layers/optimizers_test.py +++ b/tensorflow/contrib/layers/python/layers/optimizers_test.py @@ -75,7 +75,8 @@ class OptimizersTest(tf.test.TestCase): tf.initialize_all_variables().run() session.run(train, feed_dict={x: 5}) var_value, global_step_value = session.run([var, global_step]) - self.assertAlmostEqual(var_value, 8.58150, 4) + # Due to randomness the following number may change if graph is different. 
+ self.assertAlmostEqual(var_value, 8.5591021, 4) self.assertEqual(global_step_value, 1) def testGradientNoiseWithClipping(self): diff --git a/tensorflow/contrib/layers/python/layers/target_column.py b/tensorflow/contrib/layers/python/layers/target_column.py index 9f321895025..78178816f35 100644 --- a/tensorflow/contrib/layers/python/layers/target_column.py +++ b/tensorflow/contrib/layers/python/layers/target_column.py @@ -22,6 +22,7 @@ import inspect import six +from tensorflow.contrib import losses from tensorflow.contrib import metrics as metrics_lib from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -29,7 +30,6 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import logging_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn -from tensorflow.python.ops import nn_ops def regression_target(label_name=None, @@ -70,7 +70,7 @@ def multi_class_target(n_classes, label_name=None, weight_column_name=None): will be multiplied by the loss of the example. Returns: - An instance of _TargetColumn + An instance of _MultiClassTargetColumn. 
Raises: ValueError: if n_classes is < 2 @@ -297,8 +297,17 @@ class _BinarySvmTargetColumn(_MultiClassTargetColumn): """_TargetColumn for binary classification using SVMs.""" def __init__(self, label_name, weight_column_name): + def loss_fn(logits, target): + check_shape_op = logging_ops.Assert( + math_ops.less_equal(array_ops.rank(target), 2), + ["target's shape should be either [batch_size, 1] or [batch_size]"]) + with ops.control_dependencies([check_shape_op]): + target = array_ops.reshape( + target, shape=[array_ops.shape(target)[0], 1]) + return losses.hinge_loss(logits, target) + super(_BinarySvmTargetColumn, self).__init__( - loss_fn=_binary_hinge_loss, + loss_fn=loss_fn, n_classes=2, label_name=label_name, weight_column_name=weight_column_name) @@ -331,22 +340,6 @@ def _log_loss_with_two_classes(logits, target): return loss_vec -# TODO(sibyl-vie3Poto): Move this to contrib/losses/python/losses/loss_ops.py. -def _binary_hinge_loss(logits, target): - """Method that returns the loss vector for binary hinge loss.""" - check_shape_op = logging_ops.Assert( - math_ops.less_equal( - array_ops.rank(target), 2), - ["target's shape should be either [batch_size, 1] or [batch_size]"]) - with ops.control_dependencies([check_shape_op]): - target = array_ops.reshape(target, shape=[array_ops.shape(target)[0], 1]) - # First need to convert binary labels to -1/1 labels (as floats). - all_ones = array_ops.ones_like(logits) - labels = math_ops.sub(2 * math_ops.to_float(target), all_ones) - loss_vec = nn_ops.relu(math_ops.sub(all_ones, math_ops.mul(labels, logits))) - return loss_vec - - def _softmax_cross_entropy_loss(logits, target): # sigmoid_cross_entropy_with_logits requires [batch_size, 1] target. # Check that we got int32/int64 for classification. 
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index 572b025039c..c617d43a616 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -36,6 +36,18 @@ py_test( ], ) +py_test( + name = "load_csv_test", + size = "small", + srcs = ["python/learn/tests/load_csv_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":learn", + "//tensorflow:tensorflow_py", + "//tensorflow/python:framework_test_lib", + ], +) + py_test( name = "data_feeder_test", size = "small", @@ -235,9 +247,9 @@ py_test( ) py_test( - name = "compare_test", + name = "binary_transform_test", size = "small", - srcs = ["python/learn/tests/dataframe/compare_test.py"], + srcs = ["python/learn/tests/dataframe/binary_transform_test.py"], srcs_version = "PY2AND3", deps = [ ":learn", @@ -625,19 +637,6 @@ py_test( ], ) -py_test( - name = "checkpoints_test", - size = "small", - srcs = ["python/learn/utils/checkpoints_test.py"], - srcs_version = "PY2AND3", - deps = [ - ":learn", - "//tensorflow:tensorflow_py", - "//tensorflow/python:framework", - "//tensorflow/python:framework_test_lib", - ], -) - py_test( name = "graph_io_test", size = "small", diff --git a/tensorflow/contrib/learn/python/learn/README.md b/tensorflow/contrib/learn/python/learn/README.md index 2016f53a8a2..6c2d0cefbbd 100644 --- a/tensorflow/contrib/learn/python/learn/README.md +++ b/tensorflow/contrib/learn/python/learn/README.md @@ -56,6 +56,7 @@ Below are few simple examples of the API. 
For more examples, please see [example Simple linear classification: ```python +import tensorflow.contrib.learn.python.learn as learn from sklearn import datasets, metrics iris = datasets.load_iris() @@ -70,6 +71,7 @@ print("Accuracy: %f" % score) Simple linear regression: ```python +import tensorflow.contrib.learn.python.learn as learn from sklearn import datasets, metrics, preprocessing boston = datasets.load_boston() @@ -85,6 +87,7 @@ print ("MSE: %f" % score) Example of 3 layer network with 10, 20 and 10 hidden units respectively: ```python +import tensorflow.contrib.learn.python.learn as learn from sklearn import datasets, metrics iris = datasets.load_iris() @@ -99,6 +102,7 @@ print("Accuracy: %f" % score) Example of how to pass a custom model to the Estimator: ```python +import tensorflow.contrib.learn.python.learn as learn from sklearn import datasets, metrics iris = datasets.load_iris() diff --git a/tensorflow/contrib/learn/python/learn/__init__.py b/tensorflow/contrib/learn/python/learn/__init__.py index 9b7a31ede42..50089e18a03 100644 --- a/tensorflow/contrib/learn/python/learn/__init__.py +++ b/tensorflow/contrib/learn/python/learn/__init__.py @@ -33,6 +33,7 @@ from tensorflow.contrib.learn.python.learn import preprocessing from tensorflow.contrib.learn.python.learn import utils from tensorflow.contrib.learn.python.learn.dataframe import * from tensorflow.contrib.learn.python.learn.estimators import * +from tensorflow.contrib.learn.python.learn.evaluable import Evaluable from tensorflow.contrib.learn.python.learn.experiment import Experiment from tensorflow.contrib.learn.python.learn.graph_actions import evaluate from tensorflow.contrib.learn.python.learn.graph_actions import infer @@ -41,4 +42,5 @@ from tensorflow.contrib.learn.python.learn.graph_actions import run_feeds from tensorflow.contrib.learn.python.learn.graph_actions import run_n from tensorflow.contrib.learn.python.learn.graph_actions import train from 
tensorflow.contrib.learn.python.learn.learn_io import * +from tensorflow.contrib.learn.python.learn.trainable import Trainable # pylint: enable=wildcard-import diff --git a/tensorflow/contrib/learn/python/learn/dataframe/__init__.py b/tensorflow/contrib/learn/python/learn/dataframe/__init__.py index e066463b947..8fba9b65136 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/__init__.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/__init__.py @@ -29,11 +29,14 @@ from tensorflow.contrib.learn.python.learn.dataframe.transform import Transform # Transforms from tensorflow.contrib.learn.python.learn.dataframe.transforms.boolean_mask import BooleanMask +from tensorflow.contrib.learn.python.learn.dataframe.transforms.difference import Difference +from tensorflow.contrib.learn.python.learn.dataframe.transforms.hashes import HashFast from tensorflow.contrib.learn.python.learn.dataframe.transforms.in_memory_source import NumpySource from tensorflow.contrib.learn.python.learn.dataframe.transforms.in_memory_source import PandasSource from tensorflow.contrib.learn.python.learn.dataframe.transforms.reader_source import ReaderSource from tensorflow.contrib.learn.python.learn.dataframe.transforms.sum import Sum + # pylint: disable=g-import-not-at-top,g-bad-import-order # Unary Transform registration @@ -42,9 +45,9 @@ for ut_def in _ut.UNARY_TRANSFORMS: _ut.register_unary_op(*ut_def) # Comparison Transform registration -from tensorflow.contrib.learn.python.learn.dataframe.transforms import compare as _cmp -for ct_def in _cmp.COMPARISON_TRANSFORMS: - _cmp.register_comparison_ops(*ct_def) +from tensorflow.contrib.learn.python.learn.dataframe.transforms import binary_transforms as _bt +for bt_def in _bt.BINARY_TRANSFORMS: + _bt.register_binary_op(*bt_def) __all__ = ['DataFrame', 'Series', 'PredefinedSeries', 'TransformedSeries', 'TensorFlowDataFrame', 'parameter', 'Transform'] diff --git a/tensorflow/contrib/learn/python/learn/dataframe/dataframe.py 
b/tensorflow/contrib/learn/python/learn/dataframe/dataframe.py index 31093b9937a..6e03f086425 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/dataframe.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/dataframe.py @@ -117,10 +117,11 @@ class DataFrame(object): value = [value] self.assign(**dict(zip(key, value))) - def build(self): + def build(self, **kwargs): # We do not allow passing a cache here, because that would encourage # working around the rule that DataFrames cannot be expected to be # synced with each other (e.g., they shuffle independently). cache = {} - tensors = {name: c.build(cache) for name, c in self._columns.items()} + tensors = {name: c.build(cache, **kwargs) + for name, c in self._columns.items()} return tensors diff --git a/tensorflow/contrib/learn/python/learn/dataframe/estimator_utils.py b/tensorflow/contrib/learn/python/learn/dataframe/estimator_utils.py index bff0c4e4af0..313ae41cfe8 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/estimator_utils.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/estimator_utils.py @@ -91,7 +91,8 @@ def _build_alternate_universe( def to_feature_columns_and_input_fn(dataframe, base_input_keys_with_defaults, feature_keys, - target_keys=None): + target_keys=None, + **kwargs): """Build a list of FeatureColumns and an input_fn for use with Estimator. Args: @@ -103,6 +104,7 @@ def to_feature_columns_and_input_fn(dataframe, These may include base features and/or derived features. target_keys: the names of columns to be used as targets. None is acceptable for unsupervised learning. + **kwargs: Additional keyword arguments, unused here. Returns: A tuple of two elements: @@ -155,10 +157,11 @@ def to_feature_columns_and_input_fn(dataframe, # Build an input_fn suitable for use with Estimator. def input_fn(): + """An input_fn() for feeding the given set of DataFrameColumns.""" # It's important to build all the tensors together in one DataFrame. 
# If we did df.select() for both key sets and then build those, the two # resulting DataFrames would be shuffled independently. - tensors = limited_dataframe.build() + tensors = limited_dataframe.build(**kwargs) base_input_features = {key: tensors[key] for key in base_input_keys} targets = {key: tensors[key] for key in target_keys} diff --git a/tensorflow/contrib/learn/python/learn/dataframe/series.py b/tensorflow/contrib/learn/python/learn/dataframe/series.py index 12daa7d7cb8..5893db3aad2 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/series.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/series.py @@ -98,7 +98,7 @@ class Series(object): return transform_cls return register - def build(self, cache): + def build(self, cache, **kwargs): """Returns a Tensor.""" raise NotImplementedError() @@ -122,7 +122,7 @@ class PredefinedSeries(Series): def required_base_features(self): return {self.name: self.feature_spec} - def build(self, cache): + def build(self, cache, **kwargs): try: return cache[self.name] except KeyError: @@ -171,10 +171,11 @@ class TransformedSeries(Series): result.update(s.required_base_features) return result - def build(self, cache=None): + def build(self, cache=None, **kwargs): if cache is None: cache = {} - all_outputs = self._transform.build_transitive(self._input_series, cache) + all_outputs = self._transform.build_transitive( + self._input_series, cache, **kwargs) return getattr(all_outputs, self._output_name) def __repr__(self): diff --git a/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py b/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py index 45df3ac16d5..ddd2b8bfb6e 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py @@ -28,6 +28,7 @@ from tensorflow.contrib.learn.python.learn.dataframe import dataframe as df from 
tensorflow.contrib.learn.python.learn.dataframe.transforms import batch from tensorflow.contrib.learn.python.learn.dataframe.transforms import csv_parser from tensorflow.contrib.learn.python.learn.dataframe.transforms import example_parser +from tensorflow.contrib.learn.python.learn.dataframe.transforms import hashes from tensorflow.contrib.learn.python.learn.dataframe.transforms import in_memory_source from tensorflow.contrib.learn.python.learn.dataframe.transforms import reader_source from tensorflow.contrib.learn.python.learn.dataframe.transforms import sparsify @@ -83,7 +84,8 @@ class TensorFlowDataFrame(df.DataFrame): graph=None, session=None, start_queues=True, - initialize_variables=True): + initialize_variables=True, + **kwargs): """Builds and runs the columns of the `DataFrame` and yields batches. This is a generator that yields a dictionary mapping column names to @@ -97,6 +99,7 @@ class TensorFlowDataFrame(df.DataFrame): start_queues: if true, queues will be started before running and halted after producting `n` batches. initialize_variables: if true, variables will be initialized. + **kwargs: Additional keyword arguments e.g. `num_epochs`. Yields: A dictionary, mapping column names to the values resulting from running @@ -107,7 +110,7 @@ class TensorFlowDataFrame(df.DataFrame): with graph.as_default(): if session is None: session = sess.Session() - self_built = self.build() + self_built = self.build(**kwargs) keys = list(self_built.keys()) cols = list(self_built.values()) if initialize_variables: @@ -157,6 +160,52 @@ class TensorFlowDataFrame(df.DataFrame): "Original error: {}").format(type(col), e)) return result + def split(self, index_series, proportion, batch_size=None): + """Deterministically split a `DataFrame` into two `DataFrame`s. + + Note this split is only as deterministic as the underlying hash function; + see `tf.string_to_hash_bucket_fast`. The hash function is deterministic + for a given binary, but may change occasionally. 
The only way to achieve + an absolute guarantee that the split `DataFrame`s do not change across runs + is to materialize them. + + Note too that the allocation of a row to one partition or the + other is evaluated independently for each row, so the exact number of rows + in each partition is binomially distributed. + + Args: + index_series: a `Series` of unique strings, whose hash will determine the + partitioning; or the name in this `DataFrame` of such a `Series`. + (This `Series` must contain strings because TensorFlow provides hash + ops only for strings, and there are no number-to-string converter ops.) + proportion: The proportion of the rows to select for the 'left' + partition; the remaining (1 - proportion) rows form the 'right' + partition. + batch_size: the batch size to use when rebatching the left and right + `DataFrame`s. If None (default), the `DataFrame`s are not rebatched; + thus their batches will have variable sizes, according to which rows + are selected from each batch of the original `DataFrame`. + + Returns: + Two `DataFrame`s containing the partitioned rows. + """ + # TODO(soergel): allow seed? + if isinstance(index_series, str): + index_series = self[index_series] + num_buckets = 1000000 # close enough for simple splits + hashed_input, = hashes.HashFast(num_buckets)(index_series) + threshold = int(num_buckets * proportion) + left = hashed_input < threshold + right = ~left + left_rows = self.select_rows(left) + right_rows = self.select_rows(right) + + if batch_size: + left_rows = left_rows.batch(batch_size=batch_size, shuffle=False) + right_rows = right_rows.batch(batch_size=batch_size, shuffle=False) + + return left_rows, right_rows + def run_once(self): """Creates a new 'Graph` and `Session` and runs a single batch. 
@@ -208,7 +257,7 @@ class TensorFlowDataFrame(df.DataFrame): @classmethod def _from_csv_base(cls, filepatterns, get_default_values, has_header, - column_names, num_epochs, num_threads, enqueue_size, + column_names, num_threads, enqueue_size, batch_size, queue_capacity, min_after_dequeue, shuffle, seed): """Create a `DataFrame` from CSV files. @@ -223,9 +272,6 @@ class TensorFlowDataFrame(df.DataFrame): each column, given the column names. has_header: whether or not the CSV files have headers. column_names: a list of names for the columns in the CSV files. - num_epochs: the number of times that the reader should loop through all - the file names. If set to `None`, then the reader will continue - indefinitely. num_threads: the number of readers that will work in parallel. enqueue_size: block size for each read operation. batch_size: desired batch size. @@ -265,7 +311,6 @@ class TensorFlowDataFrame(df.DataFrame): reader_kwargs=reader_kwargs, enqueue_size=enqueue_size, batch_size=batch_size, - num_epochs=num_epochs, queue_capacity=queue_capacity, shuffle=shuffle, min_after_dequeue=min_after_dequeue, @@ -287,7 +332,6 @@ class TensorFlowDataFrame(df.DataFrame): default_values, has_header=True, column_names=None, - num_epochs=None, num_threads=1, enqueue_size=None, batch_size=32, @@ -306,9 +350,6 @@ class TensorFlowDataFrame(df.DataFrame): default_values: a list of default values for each column. has_header: whether or not the CSV files have headers. column_names: a list of names for the columns in the CSV files. - num_epochs: the number of times that the reader should loop through all - the file names. If set to `None`, then the reader will continue - indefinitely. num_threads: the number of readers that will work in parallel. enqueue_size: block size for each read operation. batch_size: desired batch size. 
@@ -332,7 +373,7 @@ class TensorFlowDataFrame(df.DataFrame): return default_values return cls._from_csv_base(filepatterns, get_default_values, has_header, - column_names, num_epochs, num_threads, + column_names, num_threads, enqueue_size, batch_size, queue_capacity, min_after_dequeue, shuffle, seed) @@ -342,7 +383,6 @@ class TensorFlowDataFrame(df.DataFrame): feature_spec, has_header=True, column_names=None, - num_epochs=None, num_threads=1, enqueue_size=None, batch_size=32, @@ -362,9 +402,6 @@ class TensorFlowDataFrame(df.DataFrame): `VarLenFeature`. has_header: whether or not the CSV files have headers. column_names: a list of names for the columns in the CSV files. - num_epochs: the number of times that the reader should loop through all - the file names. If set to `None`, then the reader will continue - indefinitely. num_threads: the number of readers that will work in parallel. enqueue_size: block size for each read operation. batch_size: desired batch size. @@ -387,7 +424,7 @@ class TensorFlowDataFrame(df.DataFrame): return [_get_default_value(feature_spec[name]) for name in column_names] dataframe = cls._from_csv_base(filepatterns, get_default_values, has_header, - column_names, num_epochs, num_threads, + column_names, num_threads, enqueue_size, batch_size, queue_capacity, min_after_dequeue, shuffle, seed) @@ -405,7 +442,6 @@ class TensorFlowDataFrame(df.DataFrame): filepatterns, features, reader_cls=io_ops.TFRecordReader, - num_epochs=None, num_threads=1, enqueue_size=None, batch_size=32, @@ -421,9 +457,6 @@ class TensorFlowDataFrame(df.DataFrame): `FixedLenFeature`. reader_cls: a subclass of `tensorflow.ReaderBase` that will be used to read the `Example`s. - num_epochs: the number of times that the reader should loop through all - the file names. If set to `None`, then the reader will continue - indefinitely. num_threads: the number of readers that will work in parallel. enqueue_size: block size for each read operation. batch_size: desired batch size. 
@@ -454,7 +487,6 @@ class TensorFlowDataFrame(df.DataFrame): filenames, enqueue_size=enqueue_size, batch_size=batch_size, - num_epochs=num_epochs, queue_capacity=queue_capacity, shuffle=shuffle, min_after_dequeue=min_after_dequeue, diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transform.py b/tensorflow/contrib/learn/python/learn/dataframe/transform.py index 745d556f929..bbb97d2f290 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/transform.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/transform.py @@ -223,13 +223,14 @@ class Transform(object): # pylint: disable=not-callable return self.return_type(*output_series) - def build_transitive(self, input_series, cache=None): + def build_transitive(self, input_series, cache=None, **kwargs): """Apply this `Transform` to the provided `Series`, producing 'Tensor's. Args: input_series: None, a `Series`, or a list of input `Series`, acting as positional arguments. cache: a dict from Series reprs to Tensors. + **kwargs: Additional keyword arguments, unused here. Returns: A namedtuple of the output Tensors. @@ -244,7 +245,7 @@ class Transform(object): if len(input_series) != self.input_valency: raise ValueError("Expected %s input Series but received %s." % (self.input_valency, len(input_series))) - input_tensors = [series.build(cache) for series in input_series] + input_tensors = [series.build(cache, **kwargs) for series in input_series] # Note we cache each output individually, not just the entire output # tuple. 
This allows using the graph as the cache, since it can sensibly @@ -254,7 +255,7 @@ class Transform(object): output_tensors = [cache.get(output_repr) for output_repr in output_reprs] if None in output_tensors: - result = self._apply_transform(input_tensors) + result = self._apply_transform(input_tensors, **kwargs) for output_name, output_repr in zip(self.output_names, output_reprs): cache[output_repr] = getattr(result, output_name) else: @@ -264,12 +265,13 @@ class Transform(object): return result @abstractmethod - def _apply_transform(self, input_tensors): + def _apply_transform(self, input_tensors, **kwargs): """Applies the transformation to the `transform_input`. Args: - input_tensors: a list of Tensors representing the input to + input_tensors: a list of Tensors representing the input to the Transform. + **kwargs: Additional keyword arguments, unused here. Returns: A namedtuple of Tensors representing the transformed output. diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py index 352a028ee33..cf1585634ca 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/batch.py @@ -72,7 +72,7 @@ class Batch(AbstractBatchTransform): def name(self): return "Batch" - def _apply_transform(self, transform_input): + def _apply_transform(self, transform_input, **kwargs): batched = input_ops.batch(transform_input, batch_size=self.batch_size, num_threads=self.num_threads, @@ -121,7 +121,7 @@ class ShuffleBatch(AbstractBatchTransform): def seed(self): return self._seed - def _apply_transform(self, transform_input): + def _apply_transform(self, transform_input, **kwargs): batched = input_ops.shuffle_batch(transform_input, batch_size=self.batch_size, capacity=self.queue_capacity, diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/compare.py 
b/tensorflow/contrib/learn/python/learn/dataframe/transforms/binary_transforms.py similarity index 52% rename from tensorflow/contrib/learn/python/learn/dataframe/transforms/compare.py rename to tensorflow/contrib/learn/python/learn/dataframe/transforms/binary_transforms.py index f0f6f4b69b9..78a21250c9c 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/compare.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/binary_transforms.py @@ -1,4 +1,4 @@ -# Copyright 2016 Google Inc. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - -"""Transforms for comparing pairs of `Series`.""" +"""Transforms that wrap binary TensorFlow operations.""" from __future__ import absolute_import from __future__ import division @@ -27,25 +26,24 @@ from tensorflow.python.ops import math_ops # Each entry is a mapping from registered_name to operation. Each operation is # wrapped in a transform and then registered as a member function # `Series`.registered_name(). 
-COMPARISON_TRANSFORMS = [("__eq__", math_ops.equal), - ("__gt__", math_ops.greater), - ("__ge__", math_ops.greater_equal), - ("__lt__", math_ops.less), - ("__le__", math_ops.less_equal)] +BINARY_TRANSFORMS = [("__eq__", math_ops.equal), + ("__gt__", math_ops.greater), + ("__ge__", math_ops.greater_equal), + ("__lt__", math_ops.less), + ("__le__", math_ops.less_equal), + ("__mul__", math_ops.mul), + ("__div__", math_ops.div), + ("__truediv__", math_ops.truediv), + ("__floordiv__", math_ops.floordiv), + ("__mod__", math_ops.mod), + ("pow", math_ops.pow)] -SERIES_DOC_FORMAT_STRING = ( - "A `Transform` that uses `{0}` to compare two Series. " - "Documentation for `{0}`: \n\n {1}" -) - -SCALAR_DOC_FORMAT_STRING = ( - "A `Transform` that uses `{0}` to compare a Series and a scalar. " - "Documentation for `{0}`: \n\n {1}" -) +_DOC_FORMAT_STRING = ("A `Transform` that wraps `{0}`. " + "Documentation for `{0}`: \n\n {1}") -class SeriesComparisonTransform(transform.Transform): - """Parent class for `Transform`s that compare `Series` elementwise.""" +class SeriesBinaryTransform(transform.Transform): + """Parent class for `Transform`s that operate on two `Series`.""" @property def input_valency(self): @@ -55,32 +53,31 @@ class SeriesComparisonTransform(transform.Transform): def _output_names(self): return "output", - def _apply_transform(self, input_tensors): - # TODO(jamieas): consider supporting sparse comparisons. + def _apply_transform(self, input_tensors, **kwargs): + # TODO(jamieas): consider supporting sparse inputs. 
if isinstance(input_tensors[0], ops.SparseTensor) or isinstance( input_tensors[1], ops.SparseTensor): - raise TypeError("{} does not support SparseTensors".format(type( - self).__name__)) + raise TypeError("{} does not support SparseTensors".format( + type(self).__name__)) # pylint: disable=not-callable - return self.return_type(self._compare(input_tensors[0], input_tensors[1])) + return self.return_type(self._apply_op(input_tensors[0], input_tensors[1])) -class ScalarComparisonTransform(transform.Transform): - """Parent class for `Transform`s that compare `Series` to a scalar.""" +class ScalarBinaryTransform(transform.Transform): + """Parent class for `Transform`s that combine `Series` to a scalar.""" - def __init__(self, threshold): - if isinstance(threshold, series.Series): - raise ValueError( - "{} is used to compare Series with scalars. It was called with " - "another Series.".format( - type(self).__name__)) - super(ScalarComparisonTransform, self).__init__() - self._threshold = threshold + def __init__(self, scalar): + if isinstance(scalar, series.Series): + raise ValueError("{} takes a Series and a scalar. 
" + "It was called with another Series.".format( + type(self).__name__)) + super(ScalarBinaryTransform, self).__init__() + self._scalar = scalar @transform.parameter - def threshold(self): - return self._threshold + def scalar(self): + return self._scalar @property def input_valency(self): @@ -90,67 +87,66 @@ class ScalarComparisonTransform(transform.Transform): def _output_names(self): return "output", - def _apply_transform(self, input_tensors): + def _apply_transform(self, input_tensors, **kwargs): input_tensor = input_tensors[0] if isinstance(input_tensor, ops.SparseTensor): result = ops.SparseTensor(input_tensor.indices, - self._compare(input_tensor.values), + self._apply_op(input_tensor.values), input_tensor.shape) else: - result = self._compare(input_tensor) + result = self._apply_op(input_tensor) # pylint: disable=not-callable return self.return_type(result) # pylint: disable=unused-argument -def register_comparison_ops(method_name, operation): - """Registers `Series` member functions for comparisons. +def register_binary_op(method_name, operation): + """Registers `Series` member functions for binary operations. Args: method_name: the name of the method that will be created in `Series`. - operation: TensorFlow operation used for comparison. + operation: underlying TensorFlow operation. """ - # Define series-series comparison `Transform`. + # Define series-series `Transform`. 
@property def series_name(self): return operation.__name__ - series_doc = SERIES_DOC_FORMAT_STRING.format(operation.__name__, - operation.__doc__) - def series_compare(self, x, y): + series_doc = _DOC_FORMAT_STRING.format(operation.__name__, operation.__doc__) + + def series_apply_op(self, x, y): return operation(x, y) series_transform_cls = type("scalar_{}".format(operation.__name__), - (SeriesComparisonTransform,), + (SeriesBinaryTransform,), {"name": series_name, "__doc__": series_doc, - "_compare": series_compare}) + "_apply_op": series_apply_op}) - # Define series-scalar comparison `Transform`. + # Define series-scalar `Transform`. @property def scalar_name(self): return "scalar_{}".format(operation.__name__) - scalar_doc = SCALAR_DOC_FORMAT_STRING.format(operation.__name__, - operation.__doc__) + scalar_doc = _DOC_FORMAT_STRING.format(operation.__name__, operation.__doc__) - def scalar_compare(self, x): - return operation(x, self.threshold) + def scalar_apply_op(self, x): + return operation(x, self.scalar) scalar_transform_cls = type("scalar_{}".format(operation.__name__), - (ScalarComparisonTransform,), + (ScalarBinaryTransform,), {"name": scalar_name, "__doc__": scalar_doc, - "_compare": scalar_compare}) + "_apply_op": scalar_apply_op}) # Define function that delegates to the two `Transforms`. - def _comparison_fn(self, other, *args, **kwargs): + def _fn(self, other, *args, **kwargs): # pylint: disable=not-callable,abstract-class-instantiated if isinstance(other, series.Series): return series_transform_cls(*args, **kwargs)([self, other])[0] return scalar_transform_cls(other, *args, **kwargs)([self])[0] # Register new member function of `Series`. 
- setattr(series.Series, method_name, _comparison_fn) + setattr(series.Series, method_name, _fn) diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/boolean_mask.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/boolean_mask.py index f572cf137f7..eb5a8edbfb6 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/boolean_mask.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/boolean_mask.py @@ -77,18 +77,21 @@ class BooleanMask(transform.Transform): def _output_names(self): return "output", - def _apply_transform(self, input_tensors): + def _apply_transform(self, input_tensors, **kwargs): """Applies the transformation to the `transform_input`. Args: - input_tensors: a list of Tensors representing the input to + input_tensors: a list of Tensors representing the input to the Transform. + **kwargs: Additional keyword arguments, unused here. Returns: A namedtuple of Tensors representing the transformed output. """ input_tensor = input_tensors[0] mask = input_tensors[1] + if mask.get_shape().ndims > 1: + mask = array_ops.squeeze(mask) if isinstance(input_tensor, ops.SparseTensor): mask_fn = sparse_boolean_mask diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/csv_parser.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/csv_parser.py index caa83f5a966..d78b5652d6e 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/csv_parser.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/csv_parser.py @@ -58,7 +58,7 @@ class CSVParser(transform.Transform): def default_values(self): return self._default_values - def _apply_transform(self, input_tensors): + def _apply_transform(self, input_tensors, **kwargs): default_consts = [constant_op.constant(d, shape=[1]) for d in self._default_values] parsed_values = parsing_ops.decode_csv(input_tensors[0], diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/densify.py 
b/tensorflow/contrib/learn/python/learn/dataframe/transforms/densify.py index 2f389153178..0f0c1a08911 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/densify.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/densify.py @@ -47,12 +47,13 @@ class Densify(transform.Transform): def _output_names(self): return "output", - def _apply_transform(self, input_tensors): + def _apply_transform(self, input_tensors, **kwargs): """Applies the transformation to the `transform_input`. Args: - input_tensors: a list of Tensors representing the input to + input_tensors: a list of Tensors representing the input to the Transform. + **kwargs: Additional keyword arguments, unused here. Returns: A namedtuple of Tensors representing the transformed output. diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/difference.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/difference.py new file mode 100644 index 00000000000..b585fceeb63 --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/difference.py @@ -0,0 +1,67 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""A `Transform` that performs subtraction on two `Series`.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.learn.python.learn.dataframe import series +from tensorflow.contrib.learn.python.learn.dataframe import transform +from tensorflow.python.framework import ops +from tensorflow.python.ops import sparse_ops + + +def _negate_sparse(sparse_tensor): + return ops.SparseTensor(indices=sparse_tensor.indices, + values=-sparse_tensor.values, + shape=sparse_tensor.shape) + + +@series.Series.register_binary_op("__sub__") +class Difference(transform.Transform): + """Subtracts one `Series` from another.""" + + def __init__(self): + super(Difference, self).__init__() + + @property + def name(self): + return "difference" + + @property + def input_valency(self): + return 2 + + @property + def _output_names(self): + return "output", + + def _apply_transform(self, input_tensors, **kwargs): + pair_sparsity = (isinstance(input_tensors[0], ops.SparseTensor), + isinstance(input_tensors[1], ops.SparseTensor)) + + if pair_sparsity == (False, False): + result = input_tensors[0] - input_tensors[1] + # note tf.sparse_add accepts the mixed cases, + # so long as at least one input is sparse.
+ elif not pair_sparsity[1]: + result = sparse_ops.sparse_add(input_tensors[0], - input_tensors[1]) + else: + result = sparse_ops.sparse_add(input_tensors[0], + _negate_sparse(input_tensors[1])) + # pylint: disable=not-callable + return self.return_type(result) diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/example_parser.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/example_parser.py index e22ef740ed9..c2c5e0cbed5 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/example_parser.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/example_parser.py @@ -61,7 +61,7 @@ class ExampleParser(transform.Transform): def feature_definitions(self): return self._ordered_features - def _apply_transform(self, input_tensors): + def _apply_transform(self, input_tensors, **kwargs): parsed_values = parsing_ops.parse_example(input_tensors[0], features=self._ordered_features) # pylint: disable=not-callable diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/hashes.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/hashes.py new file mode 100644 index 00000000000..325e7827ce2 --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/hashes.py @@ -0,0 +1,68 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Hashes one `Series` into a fixed number of buckets.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.learn.python.learn.dataframe import transform +from tensorflow.python.ops import string_ops + + +class HashFast(transform.Transform): + """Perform a fast hash of a `Series`.""" + + def __init__(self, num_buckets): + """Initialize `HashFast`. + + Args: + num_buckets: The number of hash buckets to use. + """ + # TODO(soergel): allow seed? + super(HashFast, self).__init__() + self._num_buckets = num_buckets + + @property + def name(self): + return "HashFast" + + @property + def input_valency(self): + return 1 + + @property + def _output_names(self): + return "output", + + def _apply_transform(self, input_tensors, **kwargs): + """Applies the transformation to the `transform_input`. + + Args: + input_tensors: a list of Tensors representing the input to + the Transform. + **kwargs: additional keyword arguments, unused here. + + Returns: + A namedtuple of Tensors representing the transformed output.
+ """ + result = string_ops.string_to_hash_bucket_fast(input_tensors[0], + self._num_buckets, + name=None) + # pylint: disable=not-callable + return self.return_type(result) + + diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py index 97453c30325..d96d53468a5 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/in_memory_source.py @@ -89,7 +89,7 @@ class BaseInMemorySource(transform.Transform): def input_valency(self): return 0 - def _apply_transform(self, transform_input): + def _apply_transform(self, transform_input, **kwargs): queue = feeding_functions.enqueue_data(self.data, self.queue_capacity, self.shuffle, diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/reader_source.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/reader_source.py index 23556c40657..ddb2d321d1c 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/reader_source.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/reader_source.py @@ -32,7 +32,6 @@ class ReaderSource(transform.Transform): reader_kwargs=None, enqueue_size=None, batch_size=1, - num_epochs=None, queue_capacity=None, shuffle=False, min_after_dequeue=None, @@ -49,9 +48,6 @@ class ReaderSource(transform.Transform): is constructed. enqueue_size: block size for each read operation. batch_size: The desired batch size of output. Defaults to 1. - num_epochs: the number of times that the reader should loop through all - the file names. If set to `None`, then the reader will continue - indefinitely. queue_capacity: Capacity of the queue. Defaults to 10 * `batch_size`. shuffle: Whether records will be shuffled before returning. Defaults to false. 
@@ -73,7 +69,6 @@ class ReaderSource(transform.Transform): self._batch_size = batch_size self._queue_capacity = (batch_size * 10 if queue_capacity is None else queue_capacity) - self._num_epochs = num_epochs self._shuffle = shuffle self._min_after_dequeue = int(self.queue_capacity / 4 if min_after_dequeue is None else min_after_dequeue) @@ -100,10 +95,6 @@ class ReaderSource(transform.Transform): def batch_size(self): return self._batch_size - @transform.parameter - def num_epochs(self): - return self._num_epochs - @transform.parameter def queue_capacity(self): return self._queue_capacity @@ -136,11 +127,12 @@ class ReaderSource(transform.Transform): def _output_names(self): return ("index", "value") - def _apply_transform(self, transform_input): - filename_queue = input_ops.string_input_producer(self.work_units, - num_epochs=self.num_epochs, - shuffle=self.shuffle, - seed=self.seed) + def _apply_transform(self, transform_input, **kwargs): + filename_queue = input_ops.string_input_producer( + self.work_units, + num_epochs=kwargs.get("num_epochs"), + shuffle=self.shuffle, + seed=self.seed) reader_ops = [] for _ in range(self.num_threads): reader = self._reader_cls(**self._reader_kwargs) @@ -174,7 +166,6 @@ def TextFileSource(file_names, reader_kwargs=None, enqueue_size=1, batch_size=1, - num_epochs=None, queue_capacity=None, shuffle=False, min_after_dequeue=None, @@ -185,7 +176,6 @@ def TextFileSource(file_names, reader_kwargs=reader_kwargs, enqueue_size=enqueue_size, batch_size=batch_size, - num_epochs=num_epochs, queue_capacity=queue_capacity, shuffle=shuffle, min_after_dequeue=min_after_dequeue, @@ -197,7 +187,6 @@ def TFRecordSource(file_names, reader_kwargs=None, enqueue_size=1, batch_size=1, - num_epochs=None, queue_capacity=None, shuffle=False, min_after_dequeue=None, @@ -208,7 +197,6 @@ def TFRecordSource(file_names, reader_kwargs=reader_kwargs, enqueue_size=enqueue_size, batch_size=batch_size, - num_epochs=num_epochs, queue_capacity=queue_capacity, 
shuffle=shuffle, min_after_dequeue=min_after_dequeue, diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/sparsify.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/sparsify.py index 552012ea330..f3447c5d940 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/sparsify.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/sparsify.py @@ -52,12 +52,13 @@ class Sparsify(transform.Transform): def _output_names(self): return "output", - def _apply_transform(self, input_tensors): + def _apply_transform(self, input_tensors, **kwargs): """Applies the transformation to the `transform_input`. Args: - input_tensors: a list of Tensors representing the input to + input_tensors: a list of Tensors representing the input to the Transform. + **kwargs: Additional keyword arguments, unused here. Returns: A namedtuple of Tensors representing the transformed output. diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/sum.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/sum.py index 6b04166e09c..878b08f4b0a 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/sum.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/sum.py @@ -1,4 +1,4 @@ -# Copyright 2016 Google Inc. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -44,7 +44,7 @@ class Sum(transform.Transform): def _output_names(self): return "output", - def _apply_transform(self, input_tensors): + def _apply_transform(self, input_tensors, **kwargs): pair_sparsity = (isinstance(input_tensors[0], ops.SparseTensor), isinstance(input_tensors[1], ops.SparseTensor)) diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transforms/unary_transforms.py b/tensorflow/contrib/learn/python/learn/dataframe/transforms/unary_transforms.py index 3fd8c2a6a90..7f9eb7ce1da 100644 --- a/tensorflow/contrib/learn/python/learn/dataframe/transforms/unary_transforms.py +++ b/tensorflow/contrib/learn/python/learn/dataframe/transforms/unary_transforms.py @@ -1,4 +1,4 @@ -# Copyright 2016 Google Inc. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -43,7 +43,8 @@ UNARY_TRANSFORMS = [("__neg__", math_ops.neg), ("lgamma", math_ops.lgamma), ("digamma", math_ops.digamma), ("erf", math_ops.erf), - ("erfc", math_ops.erfc)] + ("erfc", math_ops.erfc), + ("__invert__", math_ops.logical_not, bool)] DOC_FORMAT_STRING = ( "A `Transform` that wraps the `{0}` operation. " @@ -52,7 +53,7 @@ DOC_FORMAT_STRING = ( # pylint: disable=unused-argument -def register_unary_op(registered_name, operation): +def register_unary_op(registered_name, operation, ignore_dtype=None): """Creates a `Transform` that wraps a unary tensorflow operation. If `registered_name` is specified, the `Transform` is registered as a member @@ -62,6 +63,8 @@ def register_unary_op(registered_name, operation): registered_name: the name of the member function of `Series` corresponding to the returned `Transform`. operation: a unary TensorFlow operation. + ignore_dtype: an optional dtype, not used here but needed for symmetry with + test. 
""" doc = DOC_FORMAT_STRING.format(operation.__name__, operation.__doc__) @@ -78,7 +81,7 @@ def register_unary_op(registered_name, operation): def _output_names(self): return "output" - def _apply_transform(self, input_tensors): + def _apply_transform(self, input_tensors, **kwargs): input_tensor = input_tensors[0] if isinstance(input_tensor, ops.SparseTensor): result = ops.SparseTensor(input_tensor.indices, diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py index b4bf6cd578e..d499a03d228 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/base.py +++ b/tensorflow/contrib/learn/python/learn/datasets/base.py @@ -24,38 +24,69 @@ import csv import os from os import path import tempfile + import numpy as np from six.moves import urllib +from tensorflow.contrib.framework import deprecated from tensorflow.python.platform import gfile Dataset = collections.namedtuple('Dataset', ['data', 'target']) Datasets = collections.namedtuple('Datasets', ['train', 'validation', 'test']) +@deprecated('2016-09-15', 'Please use load_csv_{with|without}_header instead.') def load_csv(filename, target_dtype, target_column=-1, has_header=True): """Load dataset from CSV file.""" + if has_header: + return load_csv_with_header(filename=filename, + target_dtype=target_dtype, + features_dtype=np.float64, + target_column=target_column) + else: + return load_csv_without_header(filename=filename, + target_dtype=target_dtype, + features_dtype=np.float64, + target_column=target_column) + + +def load_csv_with_header(filename, + target_dtype, + features_dtype, + target_column=-1): + """Load dataset from CSV file with a header row.""" with gfile.Open(filename) as csv_file: data_file = csv.reader(csv_file) - if has_header: - header = next(data_file) - n_samples = int(header[0]) - n_features = int(header[1]) - data = np.empty((n_samples, n_features)) - target = np.empty((n_samples,), dtype=np.int) - for i, ir in 
enumerate(data_file): - target[i] = np.asarray(ir.pop(target_column), dtype=target_dtype) - data[i] = np.asarray(ir, dtype=np.float64) - else: - data, target = [], [] - for ir in data_file: - target.append(ir.pop(target_column)) - data.append(ir) - target = np.array(target, dtype=target_dtype) - data = np.array(data) + header = next(data_file) + n_samples = int(header[0]) + n_features = int(header[1]) + data = np.zeros((n_samples, n_features)) + target = np.zeros((n_samples,), dtype=target_dtype) + for i, row in enumerate(data_file): + target[i] = np.asarray(row.pop(target_column), dtype=target_dtype) + data[i] = np.asarray(row, dtype=features_dtype) + return Dataset(data=data, target=target) +def load_csv_without_header(filename, + target_dtype, + features_dtype, + target_column=-1): + """Load dataset from CSV file without a header row.""" + with gfile.Open(filename) as csv_file: + data_file = csv.reader(csv_file) + data, target = [], [] + for row in data_file: + target.append(row.pop(target_column)) + data.append(np.asarray(row, dtype=features_dtype)) + + target = np.array(target, dtype=target_dtype) + data = np.array(data) + return Dataset(data=np.array(data), + target=np.array(target).astype(target_dtype)) + + def shrink_csv(filename, ratio): """Create a smaller dataset of only 1/ratio of original data.""" filename_small = filename.replace('.', '_small.') @@ -70,28 +101,40 @@ def shrink_csv(filename, ratio): i += 1 -def load_iris(): +def load_iris(data_path=None): """Load Iris dataset. + Args: + data_path: string, path to iris dataset (optional) + Returns: Dataset object containing data in-memory. 
""" - module_path = path.dirname(__file__) - return load_csv( - path.join(module_path, 'data', 'iris.csv'), - target_dtype=np.int) + if data_path is None: + module_path = path.dirname(__file__) + data_path = path.join(module_path, 'data', 'iris.csv') + return load_csv_with_header( + data_path, + target_dtype=np.int, + features_dtype=np.float) -def load_boston(): +def load_boston(data_path=None): """Load Boston housing dataset. + Args: + data_path: string, path to boston dataset (optional) + Returns: Dataset object containing data in-memory. """ - module_path = path.dirname(__file__) - return load_csv( - path.join(module_path, 'data', 'boston_house_prices.csv'), - target_dtype=np.float) + if data_path is None: + module_path = path.dirname(__file__) + data_path = path.join(module_path, 'data', 'boston_house_prices.csv') + return load_csv_with_header( + data_path, + target_dtype=np.float, + features_dtype=np.float) def maybe_download(filename, work_directory, source_url): diff --git a/tensorflow/contrib/learn/python/learn/datasets/mnist.py b/tensorflow/contrib/learn/python/learn/datasets/mnist.py index 79cfc063a6f..82771ef43a4 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/mnist.py +++ b/tensorflow/contrib/learn/python/learn/datasets/mnist.py @@ -216,5 +216,5 @@ def read_data_sets(train_dir, return base.Datasets(train=train, validation=validation, test=test) -def load_mnist(): - return read_data_sets('MNIST_data') +def load_mnist(train_dir='MNIST-data'): + return read_data_sets(train_dir) diff --git a/tensorflow/contrib/learn/python/learn/datasets/text_datasets.py b/tensorflow/contrib/learn/python/learn/datasets/text_datasets.py index 094dc1ada6f..eb0c8546ffd 100644 --- a/tensorflow/contrib/learn/python/learn/datasets/text_datasets.py +++ b/tensorflow/contrib/learn/python/learn/datasets/text_datasets.py @@ -61,7 +61,9 @@ def load_dbpedia(size='small', test_with_fake_data=False): train_path = os.path.join(module_path, 'data', 'text_train.csv') test_path = 
os.path.join(module_path, 'data', 'text_test.csv') - train = base.load_csv(train_path, np.int32, 0, has_header=False) - test = base.load_csv(test_path, np.int32, 0, has_header=False) + train = base.load_csv_without_header( + train_path, target_dtype=np.int32, features_dtype=np.str, target_column=0) + test = base.load_csv_without_header( + test_path, target_dtype=np.int32, features_dtype=np.str, target_column=0) return base.Datasets(train=train, validation=None, test=test) diff --git a/tensorflow/contrib/learn/python/learn/estimators/classifier_test.py b/tensorflow/contrib/learn/python/learn/estimators/classifier_test.py index d278c9e0af0..5ce6c4878b0 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/classifier_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/classifier_test.py @@ -29,14 +29,10 @@ from tensorflow.contrib.learn.python.learn.estimators import _sklearn def iris_input_fn(num_epochs=None): iris = tf.contrib.learn.datasets.load_iris() - features = tf.cast( - tf.reshape( - tf.constant(iris.data), [-1, 4]), tf.float32) + features = tf.reshape(tf.constant(iris.data), [-1, 4]) if num_epochs: features = tf.train.limit_epochs(features, num_epochs=num_epochs) - target = tf.cast( - tf.reshape( - tf.constant(iris.target), [-1]), tf.int64) + target = tf.reshape(tf.constant(iris.target), [-1]) return features, target diff --git a/tensorflow/contrib/learn/python/learn/estimators/composable_model.py b/tensorflow/contrib/learn/python/learn/estimators/composable_model.py index 20cafc4cb4a..f47ae184558 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/composable_model.py +++ b/tensorflow/contrib/learn/python/learn/estimators/composable_model.py @@ -20,11 +20,13 @@ from __future__ import division from __future__ import print_function import math +import re import six from tensorflow.contrib import layers from tensorflow.contrib.layers.python.layers import feature_column_ops +from tensorflow.contrib.learn.python.learn.utils import 
checkpoints from tensorflow.python.framework import ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import gradients @@ -47,31 +49,31 @@ class _ComposableModel(object): def __init__(self, num_label_columns, optimizer, - weight_collection_name, gradient_clip_norm, - num_ps_replicas): + num_ps_replicas, + scope): """Common initialization for all _ComposableModel objects. Args: num_label_columns: The number of label/target columns. optimizer: An instance of `tf.Optimizer` used to apply gradients to the model. If `None`, will use a FTRL optimizer. - weight_collection_name: A string defining the name to use for the - collection of weights (e.g. 'dnn'). gradient_clip_norm: A float > 0. If provided, gradients are clipped to their global norm with this clipping ratio. See tf.clip_by_global_norm for more details. num_ps_replicas: The number of parameter server replicas. + scope: Scope for variables created in this model. """ self._num_label_columns = num_label_columns self._optimizer = optimizer - self._weight_collection_name = weight_collection_name self._gradient_clip_norm = gradient_clip_norm self._num_ps_replicas = num_ps_replicas + self._scope = scope self._feature_columns = None - def get_weight_collection_name(self): - return self._weight_collection_name + def get_scope_name(self): + """Returns the scope name used by this model for variables.""" + return self._scope def build_model(self, features, feature_columns, is_training): """Builds the model that can calculate the logits. @@ -114,7 +116,7 @@ class _ComposableModel(object): def _get_vars(self): if self._get_feature_columns(): - return ops.get_collection(self._weight_collection_name) + return ops.get_collection(self._scope) return [] def _get_optimizer(self): @@ -142,7 +144,8 @@ class LinearComposableModel(_ComposableModel): num_label_columns, optimizer=None, gradient_clip_norm=None, - num_ps_replicas=0): + num_ps_replicas=0, + scope=None): """Initializes LinearComposableModel objects. 
Args: @@ -153,13 +156,49 @@ class LinearComposableModel(_ComposableModel): to their global norm with this clipping ratio. See tf.clip_by_global_norm for more details. num_ps_replicas: The number of parameter server replicas. + scope: Optional scope for variables created in this model. If scope + is not supplied, it will default to 'linear'. """ + scope = "linear" if not scope else scope super(LinearComposableModel, self).__init__( num_label_columns=num_label_columns, optimizer=optimizer, - weight_collection_name="linear", gradient_clip_norm=gradient_clip_norm, - num_ps_replicas=num_ps_replicas) + num_ps_replicas=num_ps_replicas, + scope=scope) + + def get_weights(self, model_dir): + """Returns weights per feature of the linear part. + + Args: + model_dir: Directory where model parameters, graph and etc. are saved. + + Returns: + The weights created by this model (without the optimizer weights). + """ + all_variables = [name for name, _ in checkpoints.list_variables(model_dir)] + values = {} + optimizer_regex = r".*/" + self._get_optimizer().get_name() + r"(_\d)?$" + for name in all_variables: + if (name.startswith(self._scope + "/") and + name != self._scope + "/bias_weight" and + not re.match(optimizer_regex, name)): + values[name] = checkpoints.load_variable(model_dir, name) + if len(values) == 1: + return values[list(values.keys())[0]] + return values + + def get_bias(self, model_dir): + """Returns bias of the model. + + Args: + model_dir: Directory where model parameters, graph and etc. are saved. + + Returns: + The bias weights created by this model. 
+ """ + return checkpoints.load_variable(model_dir, + name=(self._scope+"/bias_weight")) def build_model(self, features, feature_columns, is_training): """See base class.""" @@ -168,12 +207,12 @@ class LinearComposableModel(_ComposableModel): max_partitions=self._num_ps_replicas, min_slice_size=64 << 20) with variable_scope.variable_op_scope( - features.values(), "linear", partitioner=partitioner) as scope: + features.values(), self._scope, partitioner=partitioner) as scope: logits, _, _ = layers.weighted_sum_from_feature_columns( columns_to_tensors=features, feature_columns=self._get_feature_columns(), num_outputs=self._num_label_columns, - weight_collections=[self._weight_collection_name], + weight_collections=[self._scope], scope=scope) return logits @@ -200,7 +239,8 @@ class DNNComposableModel(_ComposableModel): activation_fn=nn.relu, dropout=None, gradient_clip_norm=None, - num_ps_replicas=0): + num_ps_replicas=0, + scope=None): """Initializes DNNComposableModel objects. Args: @@ -217,17 +257,50 @@ class DNNComposableModel(_ComposableModel): to their global norm with this clipping ratio. See tf.clip_by_global_norm for more details. num_ps_replicas: The number of parameter server replicas. + scope: Optional scope for variables created in this model. If not scope + is supplied, one is generated. """ + scope = "dnn" if not scope else scope super(DNNComposableModel, self).__init__( num_label_columns=num_label_columns, optimizer=optimizer, - weight_collection_name="DNN", gradient_clip_norm=gradient_clip_norm, - num_ps_replicas=num_ps_replicas) + num_ps_replicas=num_ps_replicas, + scope=scope) self._hidden_units = hidden_units self._activation_fn = activation_fn self._dropout = dropout + def get_weights(self, model_dir): + """Returns the weights of the model. + + Args: + model_dir: Directory where model parameters, graph and etc. are saved. + + Returns: + The weights created by this model. 
+ """ + return [checkpoints.load_variable( + model_dir, name=(self._scope+"/hiddenlayer_%d/weights" % i)) + for i, _ in enumerate(self._hidden_units)] + [ + checkpoints.load_variable( + model_dir, name=(self._scope+"/logits/weights"))] + + def get_bias(self, model_dir): + """Returns the bias of the model. + + Args: + model_dir: Directory where model parameters, graph and etc. are saved. + + Returns: + The bias weights created by this model. + """ + return [checkpoints.load_variable( + model_dir, name=(self._scope+"/hiddenlayer_%d/biases" % i)) + for i, _ in enumerate(self._hidden_units)] + [ + checkpoints.load_variable( + model_dir, name=(self._scope+"/logits/biases"))] + def _add_hidden_layer_summary(self, value, tag): # TODO(zakaria): Move this code to tf.learn and add test. logging_ops.scalar_summary("%s:fraction_of_zero_values" % tag, @@ -244,12 +317,12 @@ class DNNComposableModel(_ComposableModel): min_slice_size=64 << 20)) with variable_scope.variable_op_scope( features.values(), - "input_from_feature_columns", + self._scope + "/input_from_feature_columns", partitioner=input_layer_partitioner) as scope: net = layers.input_from_feature_columns( features, self._get_feature_columns(), - weight_collections=[self._weight_collection_name], + weight_collections=[self._scope], scope=scope) hidden_layer_partitioner = ( @@ -257,13 +330,13 @@ class DNNComposableModel(_ComposableModel): max_partitions=self._num_ps_replicas)) for layer_id, num_hidden_units in enumerate(self._hidden_units): with variable_scope.variable_op_scope( - [net], "hiddenlayer_%d" % layer_id, + [net], self._scope + "/hiddenlayer_%d" % layer_id, partitioner=hidden_layer_partitioner) as scope: net = layers.fully_connected( net, num_hidden_units, activation_fn=self._activation_fn, - variables_collections=[self._weight_collection_name], + variables_collections=[self._scope], scope=scope) if self._dropout is not None and is_training: net = layers.dropout( @@ -272,15 +345,15 @@ class 
DNNComposableModel(_ComposableModel): self._add_hidden_layer_summary(net, scope.name) with variable_scope.variable_op_scope( - [net], "dnn_logits", + [net], self._scope + "/logits", partitioner=hidden_layer_partitioner) as scope: logits = layers.fully_connected( net, self._num_label_columns, activation_fn=None, - variables_collections=[self._weight_collection_name], + variables_collections=[self._scope], scope=scope) - self._add_hidden_layer_summary(logits, "dnn_logits") + self._add_hidden_layer_summary(logits, "logits") return logits def _get_default_optimizer(self, optimizer_name=None): diff --git a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py index df7be73a164..57e6a455852 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py @@ -19,6 +19,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import tempfile + import tensorflow as tf from tensorflow.contrib import layers @@ -42,7 +44,7 @@ class _BaseEstimatorForTest(estimator.BaseEstimator): def __init__(self, target_column, feature_columns): - super(_BaseEstimatorForTest, self).__init__() + super(_BaseEstimatorForTest, self).__init__(model_dir=tempfile.mkdtemp()) self._target_column = target_column self._feature_columns = feature_columns diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py index 3d7ae1e380b..79a45161e7b 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py @@ -71,9 +71,9 @@ class _DNNLinearCombinedBaseEstimator(estimator.BaseEstimator): Args: target_column: A _TargetColumn object. 
- model_dir: Directory to save model parameters, graph and etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. + model_dir: Directory to save model parameters, graph and etc. This can + also be used to load checkpoints from the directory into a estimator + to continue training a previously saved model. linear_feature_columns: An iterable containing all the feature columns used by linear part of the model. All items in the set should be instances of classes derived from `FeatureColumn`. @@ -102,8 +102,8 @@ class _DNNLinearCombinedBaseEstimator(estimator.BaseEstimator): ValueError: If both linear_feature_columns and dnn_features_columns are empty at the same time. """ - super(_DNNLinearCombinedBaseEstimator, self).__init__(model_dir=model_dir, - config=config) + super(_DNNLinearCombinedBaseEstimator, self).__init__( + model_dir=model_dir, config=config) num_ps_replicas = config.num_ps_replicas if config else 0 @@ -124,8 +124,6 @@ class _DNNLinearCombinedBaseEstimator(estimator.BaseEstimator): self._linear_feature_columns = linear_feature_columns self._linear_optimizer = linear_optimizer - self._linear_weight_collection = ( - self._linear_model.get_weight_collection_name()) self._dnn_feature_columns = dnn_feature_columns self._dnn_hidden_units = dnn_hidden_units self._centered_bias_weight_collection = "centered_bias" @@ -135,38 +133,24 @@ class _DNNLinearCombinedBaseEstimator(estimator.BaseEstimator): @property def linear_weights_(self): """Returns weights per feature of the linear part.""" - all_variables = self.get_variable_names() - # TODO(ispir): Figure out a better way to retrieve variables for features. - # for example using feature info / columns. 
- values = {} - for name in all_variables: - if (name.startswith("linear/") and name.rfind("/") == 6 and - name != "linear/bias_weight"): - values[name] = self.get_variable_value(name) - if len(values) == 1: - return values[list(values.keys())[0]] - return values + return self._linear_model.get_weights(model_dir=self._model_dir) @property def linear_bias_(self): """Returns bias of the linear part.""" - return (self.get_variable_value("linear/bias_weight") + + return (self._linear_model.get_bias(model_dir=self._model_dir) + self.get_variable_value("centered_bias_weight")) @property def dnn_weights_(self): """Returns weights of deep neural network part.""" - return [self.get_variable_value("hiddenlayer_%d/weights" % i) - for i, _ in enumerate(self._dnn_hidden_units)] + [ - self.get_variable_value("dnn_logits/weights")] + return self._dnn_model.get_weights(model_dir=self._model_dir) @property def dnn_bias_(self): """Returns bias of deep neural network part.""" - return [self.get_variable_value("hiddenlayer_%d/biases" % i) - for i, _ in enumerate(self._dnn_hidden_units)] + [ - self.get_variable_value("dnn_logits/biases"), - self.get_variable_value("centered_bias_weight")] + return (self._dnn_model.get_bias(model_dir=self._model_dir) + + [self.get_variable_value("centered_bias_weight")]) def _get_feature_dict(self, features): if isinstance(features, dict): @@ -347,9 +331,9 @@ class DNNLinearCombinedClassifier(_DNNLinearCombinedBaseEstimator): """Constructs a DNNLinearCombinedClassifier instance. Args: - model_dir: Directory to save model parameters, graph and etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. + model_dir: Directory to save model parameters, graph and etc. This can + also be used to load checkpoints from the directory into a estimator + to continue training a previously saved model. n_classes: number of target classes. Default is binary classification. 
weight_column_name: A string defining feature column name representing weights. It is used to down weight or boost examples during training. @@ -532,9 +516,9 @@ class DNNLinearCombinedRegressor(_DNNLinearCombinedBaseEstimator): """Initializes a DNNLinearCombinedRegressor instance. Args: - model_dir: Directory to save model parameters, graph and etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. + model_dir: Directory to save model parameters, graph and etc. This can + also be used to load checkpoints from the directory into a estimator + to continue training a previously saved model. weight_column_name: A string defining feature column name representing weights. It is used to down weight or boost examples during training. It will be multiplied by the loss of the example. diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py index 7cfb2b68b67..9ea6de7751e 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py @@ -23,6 +23,7 @@ import tempfile import numpy as np import tensorflow as tf + from tensorflow.contrib.learn.python.learn.estimators import _sklearn @@ -458,10 +459,39 @@ class DNNLinearCombinedClassifierTest(tf.test.TestCase): self.assertLess(loss2, 0.01) self.assertTrue('centered_bias_weight' in classifier.get_variable_names()) - self.assertNotIn('dnn_logits/biases', classifier.get_variable_names()) - self.assertNotIn('dnn_logits/weights', classifier.get_variable_names()) + self.assertNotIn('dnn/logits/biases', classifier.get_variable_names()) + self.assertNotIn('dnn/logits/weights', classifier.get_variable_names()) self.assertEquals(1, len(classifier.linear_bias_)) - self.assertEquals(100, len(classifier.linear_weights_)) + self.assertEquals(2, 
len(classifier.linear_weights_)) + self.assertEquals(1, len(classifier.linear_weights_['linear/age/weight'])) + self.assertEquals( + 100, len(classifier.linear_weights_['linear/language_weights'])) + + def testLinearOnlyOneFeature(self): + """Tests that linear-only instantiation works for one feature only.""" + def input_fn(): + return { + 'language': tf.SparseTensor(values=['english'], + indices=[[0, 0]], + shape=[1, 1]) + }, tf.constant([[1]]) + + language = tf.contrib.layers.sparse_column_with_hash_bucket('language', 99) + + classifier = tf.contrib.learn.DNNLinearCombinedClassifier( + linear_feature_columns=[language]) + classifier.fit(input_fn=input_fn, steps=100) + loss1 = classifier.evaluate(input_fn=input_fn, steps=1)['loss'] + classifier.fit(input_fn=input_fn, steps=200) + loss2 = classifier.evaluate(input_fn=input_fn, steps=1)['loss'] + self.assertLess(loss2, loss1) + self.assertLess(loss2, 0.01) + self.assertTrue('centered_bias_weight' in classifier.get_variable_names()) + + self.assertNotIn('dnn/logits/biases', classifier.get_variable_names()) + self.assertNotIn('dnn/logits/weights', classifier.get_variable_names()) + self.assertEquals(1, len(classifier.linear_bias_)) + self.assertEquals(99, len(classifier.linear_weights_)) def testDNNOnly(self): """Tests that DNN-only instantiation works.""" diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index b1f6130f8c9..1dd8baa94e4 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -31,7 +31,9 @@ import six from tensorflow.contrib import framework as contrib_framework from tensorflow.contrib import layers +from tensorflow.contrib.learn.python.learn import evaluable from tensorflow.contrib.learn.python.learn import graph_actions +from tensorflow.contrib.learn.python.learn import trainable from 
tensorflow.contrib.learn.python.learn.estimators import _sklearn as sklearn from tensorflow.contrib.learn.python.learn.estimators import run_config from tensorflow.contrib.learn.python.learn.estimators import tensor_signature @@ -138,7 +140,8 @@ def _get_arguments(func): return _get_arguments(func.func) -class BaseEstimator(sklearn.BaseEstimator): +class BaseEstimator( + sklearn.BaseEstimator, evaluable.Evaluable, trainable.Trainable): """Abstract BaseEstimator class to train and evaluate TensorFlow models. Concrete implementation of this class should provide the following functions: @@ -158,9 +161,9 @@ class BaseEstimator(sklearn.BaseEstimator): """Initializes a BaseEstimator instance. Args: - model_dir: Directory to save model parameters, graph and etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. + model_dir: Directory to save model parameters, graph and etc. This can + also be used to load checkpoints from the directory into a estimator to + continue training a previously saved model. config: A RunConfig instance. """ # Model directory. @@ -173,8 +176,10 @@ class BaseEstimator(sklearn.BaseEstimator): # Create a run configuration if config is None: self._config = BaseEstimator._Config() + logging.warning('Using default config.') else: self._config = config + logging.info('Using config: %s', str(vars(self._config))) # Set device function depending if there are replicas or not. if self._config.num_ps_replicas > 0: @@ -194,34 +199,8 @@ class BaseEstimator(sklearn.BaseEstimator): def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None): - """Trains a model given training data `x` predictions and `y` targets. - - Args: - x: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. 
- y: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. - input_fn: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. - steps: Number of steps for which to train model. If `None`, train forever. - If set, `max_steps` must be `None`. - batch_size: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. - monitors: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. - max_steps: Number of total steps for which to train model. If `None`, - train forever. If set, `steps` must be `None`. - - Two calls to `fit(steps=100)` means 200 training - iterations. On the other hand, two calls to `fit(max_steps=100)` means - that the second call will not do any iteration since first call did - all 100 steps. - - Returns: - `self`, for chaining. + # pylint: disable=g-doc-args,g-doc-return-or-yield + """See `Trainable`. Raises: ValueError: If `x` or `y` are not `None` while `input_fn` is not `None`. @@ -282,61 +261,11 @@ class BaseEstimator(sklearn.BaseEstimator): return self.fit(x=x, y=y, input_fn=input_fn, steps=steps, batch_size=batch_size, monitors=monitors) - def evaluate(self, - x=None, - y=None, - input_fn=None, - feed_fn=None, - batch_size=None, - steps=None, - metrics=None, - name=None): - """Evaluates given model with provided evaluation data. - - Evaluates on the given input data. If `input_fn` is provided, that - input function should raise an end-of-input exception (`OutOfRangeError` or - `StopIteration`) after one epoch of the training data has been provided. - - By default, the whole evaluation dataset is used. If `steps` is provided, - only `steps` batches of size `batch_size` are processed. 
- - The return value is a dict containing the metrics specified in `metrics`, as - well as an entry `global_step` which contains the value of the global step - for which this evaluation was performed. - - Args: - x: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. - y: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. - input_fn: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. - feed_fn: Function creating a feed dict every time it is called. Called - once per iteration. - batch_size: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. - steps: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. - metrics: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - name: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - - Returns: - Returns `dict` with evaluation results. 
+ def evaluate( + self, x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, + steps=None, metrics=None, name=None): + # pylint: disable=g-doc-args,g-doc-return-or-yield + """See `Evaluable`. Raises: ValueError: If at least one of `x` or `y` is provided, and at least one of @@ -768,9 +697,9 @@ class Estimator(BaseEstimator): is passed to Estimator in `params` parameter. This allows to configure Estimators from hyper parameter tunning. - model_dir: Directory to save model parameters, graph and etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. + model_dir: Directory to save model parameters, graph and etc. This can + also be used to load checkpoints from the directory into a estimator to + continue training a previously saved model. config: Configuration object. params: `dict` of hyper parameters that will be passed into `model_fn`. Keys are names of parameters, values are basic python types. diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py index dbb1b40a4bc..c3dca0451dc 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py @@ -36,32 +36,26 @@ _IRIS_INPUT_DIM = 4 def boston_input_fn(num_epochs=None): boston = tf.contrib.learn.datasets.load_boston() - features = tf.cast( - tf.reshape(tf.constant(boston.data), [-1, _BOSTON_INPUT_DIM]), tf.float32) + features = tf.reshape(tf.constant(boston.data), [-1, _BOSTON_INPUT_DIM]) if num_epochs: features = tf.train.limit_epochs(features, num_epochs=num_epochs) - target = tf.cast( - tf.reshape(tf.constant(boston.target), [-1, 1]), tf.float32) + target = tf.reshape(tf.constant(boston.target), [-1, 1]) return features, target def iris_input_fn(): iris = tf.contrib.learn.datasets.load_iris() - features = tf.cast( - 
tf.reshape(tf.constant(iris.data), [-1, _IRIS_INPUT_DIM]), tf.float32) - target = tf.cast( - tf.reshape(tf.constant(iris.target), [-1]), tf.int32) + features = tf.reshape(tf.constant(iris.data), [-1, _IRIS_INPUT_DIM]) + target = tf.reshape(tf.constant(iris.target), [-1]) return features, target def boston_eval_fn(): boston = tf.contrib.learn.datasets.load_boston() n_examples = len(boston.target) - features = tf.cast( - tf.reshape(tf.constant(boston.data), [n_examples, _BOSTON_INPUT_DIM]), - tf.float32) - target = tf.cast( - tf.reshape(tf.constant(boston.target), [n_examples, 1]), tf.float32) + features = tf.reshape( + tf.constant(boston.data), [n_examples, _BOSTON_INPUT_DIM]) + target = tf.reshape(tf.constant(boston.target), [n_examples, 1]) return tf.concat(0, [features, features]), tf.concat(0, [target, target]) @@ -188,7 +182,7 @@ class EstimatorTest(tf.test.TestCase): with self.assertRaises(tf.contrib.learn.NotFittedError): _ = est.evaluate( x=boston.data, - y=boston.target.astype(np.float32)) + y=boston.target.astype(np.float64)) with self.assertRaises(tf.contrib.learn.NotFittedError): est.predict(x=boston.data) @@ -197,10 +191,11 @@ class EstimatorTest(tf.test.TestCase): output_dir = tempfile.mkdtemp() est = tf.contrib.learn.Estimator(model_fn=linear_model_fn, model_dir=output_dir) - est.fit(x=boston.data, y=boston.target.astype(np.float32), steps=50) + float64_target = boston.target.astype(np.float64) + est.fit(x=boston.data, y=float64_target, steps=50) scores = est.evaluate( x=boston.data, - y=boston.target.astype(np.float32), + y=float64_target, metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error}) del est # Create another estimator object with the same output dir. @@ -210,19 +205,19 @@ class EstimatorTest(tf.test.TestCase): # Check we can evaluate and predict. 
scores2 = est2.evaluate( x=boston.data, - y=boston.target.astype(np.float32), + y=float64_target, metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error}) self.assertAllClose(scores2['MSE'], scores['MSE']) predictions = est2.predict(x=boston.data) - other_score = _sklearn.mean_squared_error(predictions, boston.target) + other_score = _sklearn.mean_squared_error(predictions, float64_target) self.assertAllClose(other_score, scores['MSE']) # Check we can keep training. - est2.fit(x=boston.data, y=boston.target.astype(np.float32), steps=100) + est2.fit(x=boston.data, y=float64_target, steps=100) scores3 = est2.evaluate( x=boston.data, - y=boston.target.astype(np.float32), + y=float64_target, metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error}) self.assertLess(scores3['MSE'], scores['MSE']) @@ -230,15 +225,16 @@ class EstimatorTest(tf.test.TestCase): boston = tf.contrib.learn.datasets.load_boston() est = tf.contrib.learn.Estimator(model_fn=linear_model_params_fn, params={'learning_rate': 0.01}) - est.fit(x=boston.data, y=boston.target.astype(np.float32), steps=100) + est.fit(x=boston.data, y=boston.target, steps=100) def testBostonAll(self): boston = tf.contrib.learn.datasets.load_boston() est = tf.contrib.learn.Estimator(model_fn=linear_model_fn) - est.fit(x=boston.data, y=boston.target.astype(np.float32), steps=100) + float64_target = boston.target.astype(np.float64) + est.fit(x=boston.data, y=float64_target, steps=100) scores = est.evaluate( x=boston.data, - y=boston.target.astype(np.float32), + y=float64_target, metrics={'MSE': tf.contrib.metrics.streaming_mean_squared_error}) predictions = est.predict(x=boston.data) other_score = _sklearn.mean_squared_error(predictions, boston.target) @@ -277,7 +273,7 @@ class EstimatorTest(tf.test.TestCase): iris = tf.contrib.learn.datasets.load_iris() est = tf.contrib.learn.Estimator(model_fn=logistic_model_no_mode_fn) x_iter = itertools.islice(iris.data, 100) - y_iter = 
itertools.islice(np.int32(iris.target), 100) + y_iter = itertools.islice(iris.target, 100) est.fit(x_iter, y_iter, steps=100) _ = est.evaluate(input_fn=iris_input_fn, steps=1) predictions = est.predict(x=iris.data)['class'] @@ -374,19 +370,16 @@ class InferRealValuedColumnsTest(tf.test.TestCase): '': tf.FixedLenFeature(shape=expected_shape, dtype=expected_dtype) }, feature_column.config) - # Note: See tf.contrib.learn.io.data_feeder for why int32 converts to float32. def testInt32Input(self): feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input( np.ones(shape=[7, 8], dtype=np.int32)) - self._assert_single_feature_column([8], tf.float32, feature_columns) + self._assert_single_feature_column([8], tf.int32, feature_columns) def testInt32InputFn(self): feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input_fn( lambda: (tf.ones(shape=[7, 8], dtype=tf.int32), None)) self._assert_single_feature_column([8], tf.int32, feature_columns) - # Note: See tf.contrib.learn.io.data_feeder for why int64 doesn't convert to - # float64. def testInt64Input(self): feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input( np.ones(shape=[7, 8], dtype=np.int64)) @@ -407,12 +400,10 @@ class InferRealValuedColumnsTest(tf.test.TestCase): lambda: (tf.ones(shape=[7, 8], dtype=tf.float32), None)) self._assert_single_feature_column([8], tf.float32, feature_columns) - # Note: See tf.contrib.learn.io.data_feeder for why float64 converts to - # float32. 
def testFloat64Input(self): feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input( np.ones(shape=[7, 8], dtype=np.float64)) - self._assert_single_feature_column([8], tf.float32, feature_columns) + self._assert_single_feature_column([8], tf.float64, feature_columns) def testFloat64InputFn(self): feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input_fn( @@ -420,9 +411,10 @@ class InferRealValuedColumnsTest(tf.test.TestCase): self._assert_single_feature_column([8], tf.float64, feature_columns) def testBoolInput(self): - feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input( - np.array([[False for _ in xrange(8)] for _ in xrange(7)])) - self._assert_single_feature_column([8], tf.float32, feature_columns) + with self.assertRaisesRegexp( + ValueError, 'on integer or non floating types are not supported'): + tf.contrib.learn.infer_real_valued_columns_from_input( + np.array([[False for _ in xrange(8)] for _ in xrange(7)])) def testBoolInputFn(self): with self.assertRaisesRegexp( @@ -431,18 +423,12 @@ class InferRealValuedColumnsTest(tf.test.TestCase): tf.contrib.learn.infer_real_valued_columns_from_input_fn( lambda: (tf.constant(False, shape=[7, 8], dtype=tf.bool), None)) - def testInvalidStringInput(self): - # pylint: disable=g-long-lambda - with self.assertRaisesRegexp( - ValueError, 'could not convert string to float'): - tf.contrib.learn.infer_real_valued_columns_from_input( - np.array([['foo%d' % i for i in xrange(8)] for _ in xrange(7)])) - def testStringInput(self): - # pylint: disable=g-long-lambda - feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input( - np.array([['%d.0' % i for i in xrange(8)] for _ in xrange(7)])) - self._assert_single_feature_column([8], tf.float32, feature_columns) + with self.assertRaisesRegexp( + ValueError, 'on integer or non floating types are not supported'): + # pylint: disable=g-long-lambda + tf.contrib.learn.infer_real_valued_columns_from_input( + 
np.array([['%d.0' % i for i in xrange(8)] for _ in xrange(7)])) def testStringInputFn(self): with self.assertRaisesRegexp( @@ -457,13 +443,13 @@ class InferRealValuedColumnsTest(tf.test.TestCase): feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input_fn( boston_input_fn) self._assert_single_feature_column( - [_BOSTON_INPUT_DIM], tf.float32, feature_columns) + [_BOSTON_INPUT_DIM], tf.float64, feature_columns) def testIrisInputFn(self): feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input_fn( iris_input_fn) self._assert_single_feature_column( - [_IRIS_INPUT_DIM], tf.float32, feature_columns) + [_IRIS_INPUT_DIM], tf.float64, feature_columns) if __name__ == '__main__': tf.test.main() diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py index f025fc0941e..beb4dd5aa86 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/linear.py +++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py @@ -122,9 +122,9 @@ class LinearClassifier(dnn_linear_combined.DNNLinearCombinedClassifier): feature_columns: An iterable containing all the feature columns used by the model. All items in the set should be instances of classes derived from `FeatureColumn`. - model_dir: Directory to save model parameters, graph and etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. + model_dir: Directory to save model parameters, graph and etc. This can + also be used to load checkpoints from the directory into a estimator + to continue training a previously saved model. n_classes: number of target classes. Default is binary classification. weight_column_name: A string defining feature column name representing weights. It is used to down weight or boost examples during training. 
It @@ -186,8 +186,8 @@ class LinearClassifier(dnn_linear_combined.DNNLinearCombinedClassifier): columns_to_tensors=features, feature_columns=self._linear_feature_columns, num_outputs=self._target_column.num_label_columns, - weight_collections=[self._linear_weight_collection], - scope="linear") + weight_collections=[self._linear_model.get_scope_name()], + scope=self._linear_model.get_scope_name()) with ops.control_dependencies([self._centered_bias()]): loss = self._target_column.loss(logits, targets, features) logging_ops.scalar_summary("loss", loss) @@ -282,9 +282,9 @@ class LinearRegressor(dnn_linear_combined.DNNLinearCombinedRegressor): feature_columns: An iterable containing all the feature columns used by the model. All items in the set should be instances of classes derived from `FeatureColumn`. - model_dir: Directory to save model parameters, graph, etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. + model_dir: Directory to save model parameters, graph, etc. This can + also be used to load checkpoints from the directory into a estimator + to continue training a previously saved model. weight_column_name: A string defining feature column name representing weights. It is used to down weight or boost examples during training. It will be multiplied by the loss of the example. diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py index cafdb980c55..e3f784cf415 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py +++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor.py @@ -56,7 +56,7 @@ class LogisticRegressor(estimator.Estimator): model_fn: Model function. See superclass Estimator for more details. This expects the returned predictions to be probabilities in [0.0, 1.0]. 
thresholds: List of floating point thresholds to use for accuracy, - precision, and recall metrics. If None, defaults to [0.5]. + precision, and recall metrics. If `None`, defaults to `[0.5]`. model_dir: Directory to save model parameters, graphs, etc. This can also be used to load checkpoints from the directory into a estimator to continue training a previously saved model. diff --git a/tensorflow/contrib/learn/python/learn/estimators/random_forest.py b/tensorflow/contrib/learn/python/learn/estimators/random_forest.py index ec704531638..5d82b2c4a5e 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/random_forest.py +++ b/tensorflow/contrib/learn/python/learn/estimators/random_forest.py @@ -17,8 +17,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import time - import numpy as np import six @@ -26,18 +24,36 @@ from tensorflow.contrib import framework as contrib_framework from tensorflow.contrib.learn.python.learn import monitors as mon from tensorflow.contrib.learn.python.learn.estimators import estimator -from tensorflow.contrib.learn.python.learn.estimators import run_config from tensorflow.contrib.tensor_forest.client import eval_metrics from tensorflow.contrib.tensor_forest.data import data_ops from tensorflow.contrib.tensor_forest.python import tensor_forest +from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops +def _assert_float32(tensors): + """Assert all tensors are float32. + + Args: + tensors: `Tensor` or `dict` of `Tensor` objects. + + Raises: + TypeError: if any tensor is not float32. + """ + if not isinstance(tensors, dict): + tensors = [tensors] + else: + tensors = tensors.values() + for tensor in tensors: + if tensor.dtype.base_dtype != dtypes.float32: + raise TypeError('Expected dtype=float32, %s.' 
% tensor) + + class LossMonitor(mon.EveryN): """Terminates training when training loss stops decreasing.""" @@ -146,6 +162,8 @@ class TensorForestEstimator(estimator.BaseEstimator): Returns: Tuple of train `Operation` and loss `Tensor`. """ + _assert_float32(features) + _assert_float32(targets) features, spec = data_ops.ParseDataTensorOrDict(features) labels = data_ops.ParseLabelTensorOrDict(targets) @@ -168,6 +186,7 @@ class TensorForestEstimator(estimator.BaseEstimator): return train, self.training_loss def _get_predict_ops(self, features): + _assert_float32(features) graph_builder = self.graph_builder_class( self.params, device_assigner=self.device_assigner, training=False, **self.construction_args) @@ -175,6 +194,8 @@ class TensorForestEstimator(estimator.BaseEstimator): return graph_builder.inference_graph(features, data_spec=spec) def _get_eval_ops(self, features, targets, metrics): + _assert_float32(features) + _assert_float32(targets) features, spec = data_ops.ParseDataTensorOrDict(features) labels = data_ops.ParseLabelTensorOrDict(targets) diff --git a/tensorflow/contrib/learn/python/learn/estimators/random_forest_test.py b/tensorflow/contrib/learn/python/learn/estimators/random_forest_test.py index 81754064d6b..640167a70bf 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/random_forest_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/random_forest_test.py @@ -19,11 +19,20 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np import tensorflow as tf class TensorForestTrainerTests(tf.test.TestCase): + def testFloat64(self): + hparams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams( + num_trees=3, max_nodes=1000, num_classes=3, num_features=4) + classifier = tf.contrib.learn.TensorForestEstimator(hparams) + iris = tf.contrib.learn.datasets.load_iris() + with self.assertRaisesRegexp(TypeError, 'float32'): + classifier.fit(x=iris.data, 
y=iris.target, steps=100) + def testClassification(self): """Tests multi-class classification using matrix data as input.""" hparams = tf.contrib.tensor_forest.python.tensor_forest.ForestHParams( @@ -31,9 +40,11 @@ class TensorForestTrainerTests(tf.test.TestCase): classifier = tf.contrib.learn.TensorForestEstimator(hparams) iris = tf.contrib.learn.datasets.load_iris() + data = iris.data.astype(np.float32) + target = iris.target.astype(np.float32) - classifier.fit(x=iris.data, y=iris.target, steps=100) - classifier.evaluate(x=iris.data, y=iris.target, steps=10) + classifier.fit(x=data, y=target, steps=100) + classifier.evaluate(x=data, y=target, steps=10) def testRegression(self): """Tests multi-class classification using matrix data as input.""" @@ -45,9 +56,11 @@ class TensorForestTrainerTests(tf.test.TestCase): regressor = tf.contrib.learn.TensorForestEstimator(hparams) boston = tf.contrib.learn.datasets.load_boston() + data = boston.data.astype(np.float32) + target = boston.target.astype(np.float32) - regressor.fit(x=boston.data, y=boston.target, steps=100) - regressor.evaluate(x=boston.data, y=boston.target, steps=10) + regressor.fit(x=data, y=target, steps=100) + regressor.evaluate(x=data, y=target, steps=10) if __name__ == '__main__': diff --git a/tensorflow/contrib/learn/python/learn/estimators/svm.py b/tensorflow/contrib/learn/python/learn/estimators/svm.py index f646cdf477c..a39254e7b49 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/svm.py +++ b/tensorflow/contrib/learn/python/learn/estimators/svm.py @@ -61,13 +61,13 @@ class SVM(linear.LinearClassifier): whose `value` is a `SparseTensor`. - if `column` is a `RealValuedColumn, a feature with `key=column.name` whose `value` is a `Tensor`. - - if `feauture_columns` is None, then `input` must contains only real + - if `feature_columns` is None, then `input` must contains only real valued `Tensor`. 
Parameters: example_id_column: A string defining the feature column name representing - example ids. Used do initialize the underlying optimizer. + example ids. Used to initialize the underlying optimizer. feature_columns: An iterable containing all the feature columns used by the model. All items in the set should be instances of classes derived from `FeatureColumn`. @@ -75,10 +75,12 @@ class SVM(linear.LinearClassifier): weights. It is used to down weight or boost examples during training. It will be multiplied by the loss of the example. model_dir: Directory to save model parameters, graph and etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. - l1_regularization: L1-regularization parameter - l2_regularization: L2-regularization parameter + be used to load checkpoints from the directory into a estimator to + continue training a previously saved model. + l1_regularization: L1-regularization parameter. Refers to global L1 + regularization (across all examples). + l2_regularization: L2-regularization parameter. Refers to global L2 + regularization (across all examples). kernels: A list of kernels for the SVM. Currently, no kernels are supported. Reserved for future use for non-linear SVMs config: RunConfig object to configure the runtime settings. 
@@ -100,12 +102,13 @@ class SVM(linear.LinearClassifier): symmetric_l1_regularization=l1_regularization, symmetric_l2_regularization=l2_regularization) - super(SVM, self).__init__(model_dir=model_dir, - n_classes=2, - weight_column_name=weight_column_name, - feature_columns=feature_columns, - optimizer=optimizer, - config=config) + super(SVM, self).__init__( + model_dir=model_dir, + n_classes=2, + weight_column_name=weight_column_name, + feature_columns=feature_columns, + optimizer=optimizer, + config=config) self._target_column = layers.binary_svm_target( weight_column_name=weight_column_name) diff --git a/tensorflow/contrib/learn/python/learn/estimators/tensor_signature.py b/tensorflow/contrib/learn/python/learn/estimators/tensor_signature.py index 2c80564abc2..483ccc9f119 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/tensor_signature.py +++ b/tensorflow/contrib/learn/python/learn/estimators/tensor_signature.py @@ -51,7 +51,11 @@ class TensorSignature(collections.namedtuple( """Returns True if signatures are compatible.""" def _shape_is_compatible_0dim(this, other): + """Checks that shapes are compatible skipping dim 0.""" other = tensor_shape.as_shape(other) + # If shapes are None (unknown) they may be compatible. 
+ if this.dims is None or other.dims is None: + return True if this.ndims != other.ndims: return False for dim, (x_dim, y_dim) in enumerate(zip(this.dims, other.dims)): diff --git a/tensorflow/contrib/learn/python/learn/estimators/tensor_signature_test.py b/tensorflow/contrib/learn/python/learn/estimators/tensor_signature_test.py index 994b9ec64fb..bfbd9de3973 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/tensor_signature_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/tensor_signature_test.py @@ -142,6 +142,30 @@ class TensorSignatureTest(tf.test.TestCase): self.assertTrue(new_signatures['a'].is_compatible_with(signatures['a'])) self.assertTrue(new_signatures['b'].is_compatible_with(signatures['b'])) + def testUnknownShape(self): + placeholder_unk = tf.placeholder(name='unk', shape=None, dtype=tf.string) + placeholder_a = tf.placeholder(name='a', shape=[None], dtype=tf.string) + placeholder_b = tf.placeholder(name='b', shape=[128, 2], dtype=tf.string) + placeholder_c = tf.placeholder(name='c', shape=[128, 2], dtype=tf.int32) + unk_signature = tensor_signature.create_signatures(placeholder_unk) + # Tensors of same dtype match unk shape signature. + self.assertTrue(tensor_signature.tensors_compatible(placeholder_unk, + unk_signature)) + self.assertTrue(tensor_signature.tensors_compatible(placeholder_a, + unk_signature)) + self.assertTrue(tensor_signature.tensors_compatible(placeholder_b, + unk_signature)) + self.assertFalse(tensor_signature.tensors_compatible(placeholder_c, + unk_signature)) + + string_signature = tensor_signature.create_signatures(placeholder_a) + int_signature = tensor_signature.create_signatures(placeholder_c) + # Unk shape Tensor matches signatures of the same dtype. 
+ self.assertTrue(tensor_signature.tensors_compatible(placeholder_unk, + string_signature)) + self.assertFalse(tensor_signature.tensors_compatible(placeholder_unk, + int_signature)) + if __name__ == '__main__': tf.test.main() diff --git a/tensorflow/contrib/learn/python/learn/evaluable.py b/tensorflow/contrib/learn/python/learn/evaluable.py new file mode 100644 index 00000000000..1ff14193939 --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/evaluable.py @@ -0,0 +1,81 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""`Evaluable` interface.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc + + +class Evaluable(object): + """Interface for objects that are evaluatable by, e.g., `Experiment`. + """ + __metaclass__ = abc.ABCMeta + + @abc.abstractmethod + def evaluate( + self, x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, + steps=None, metrics=None, name=None): + """Evaluates given model with provided evaluation data. + + Evaluates on the given input data. If `input_fn` is provided, that + input function should raise an end-of-input exception (`OutOfRangeError` or + `StopIteration`) after one epoch of the training data has been provided. + + By default, the whole evaluation dataset is used. 
If `steps` is provided, + only `steps` batches of size `batch_size` are processed. + + The return value is a dict containing the metrics specified in `metrics`, as + well as an entry `global_step` which contains the value of the global step + for which this evaluation was performed. + + Args: + x: Matrix of shape [n_samples, n_features...]. Can be iterator that + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. + y: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be + iterator that returns array of targets. The training target values + (class labels in classification, real numbers in regression). If set, + `input_fn` must be `None`. + input_fn: Input function. If set, `x`, `y`, and `batch_size` must be + `None`. + feed_fn: Function creating a feed dict every time it is called. Called + once per iteration. Must be `None` if `input_fn` is provided. + batch_size: minibatch size to use on the input, defaults to first + dimension of `x`, if specified. Must be `None` if `input_fn` is + provided. + steps: Number of steps for which to evaluate model. If `None`, evaluate + until running tensors generated by `metrics` raises an exception. + metrics: Dict of metric ops to run. If `None`, the default metric + functions are used; if `{}`, no metrics are used. If model has one + output (i.e., returning single prediction), keys are `str`, e.g. + `'accuracy'` - just a name of the metric that will show up in + the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. + `('accuracy', 'classes')`- name of the metric and name of `Tensor` in + the predictions to run this metric on. + + Metric ops should support streaming, e.g., returning + update_op and value tensors. See more details in + ../../../metrics/python/metrics/ops/streaming_metrics.py. + name: Name of the evaluation if user needs to run multiple evaluations on + different data sets, such as on training data vs test data. 
+ + Returns: + Returns `dict` with evaluation results. + """ + raise NotImplementedError diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py index 2271e5161ed..0f96b70fae1 100644 --- a/tensorflow/contrib/learn/python/learn/experiment.py +++ b/tensorflow/contrib/learn/python/learn/experiment.py @@ -21,7 +21,9 @@ from __future__ import print_function import time +from tensorflow.contrib.learn.python.learn import evaluable from tensorflow.contrib.learn.python.learn import monitors +from tensorflow.contrib.learn.python.learn import trainable from tensorflow.contrib.learn.python.learn.estimators._sklearn import NotFittedError from tensorflow.python.platform import flags from tensorflow.python.platform import tf_logging as logging @@ -47,7 +49,7 @@ class Experiment(object): """Constructor for `Experiment`. Args: - estimator: `Estimator` object. + estimator: Object implementing `Trainable` and `Evaluable`. train_input_fn: function, returns features and targets for training. eval_input_fn: function, returns features and targets for evaluation. If `eval_steps` is `None`, this should be configured only to produce for a @@ -67,7 +69,14 @@ class Experiment(object): continuous_eval_throttle_secs: Do not re-evaluate unless the last evaluation was started at least this many seconds ago for continuous_eval(). + + Raises: + ValueError: if `estimator` does not implement `Evaluable` and `Trainable`. 
""" + if not isinstance(estimator, evaluable.Evaluable): + raise ValueError("`estimator` must implement `Evaluable`.") + if not isinstance(estimator, trainable.Trainable): + raise ValueError("`estimator` must implement `Trainable`.") super(Experiment, self).__init__() self._estimator = estimator self._train_input_fn = train_input_fn diff --git a/tensorflow/contrib/learn/python/learn/graph_actions.py b/tensorflow/contrib/learn/python/learn/graph_actions.py index 5c60aa40171..6da6bee1ec0 100644 --- a/tensorflow/contrib/learn/python/learn/graph_actions.py +++ b/tensorflow/contrib/learn/python/learn/graph_actions.py @@ -171,7 +171,7 @@ def _supervised_train(graph, supervisor_is_chief: Whether the current process is the chief supervisor in charge of restoring the model and running standard services. supervisor_master: The master string to use when preparing the session. - supervisor_save_model_secs: Save a checkpoint every + supervisor_save_model_secs: Save model every `supervisor_save_model_secs` seconds when training. keep_checkpoint_max: The maximum number of recent checkpoint files to keep. As new files are created, older files are deleted. If None or 0, @@ -251,17 +251,18 @@ def _supervised_train(graph, init_fn=init_fn, keep_checkpoint_max=keep_checkpoint_max) if supervisor_is_chief: - if scaffold.summary_op is not None: - monitors.append(monitors_lib.SummarySaver( - scaffold.summary_op, - save_steps=supervisor_save_summaries_steps, - summary_writer=summary_writer)) - if supervisor_save_model_secs: - monitors.append(monitors_lib.CheckpointSaver( - # Make CheckpointSaver use a timer or change arg to be steps. 
- 3 * supervisor_save_summaries_steps, - scaffold.saver, - output_dir)) + monitors.append( + monitors_lib.SummarySaver( + summary_op=None, + save_steps=supervisor_save_summaries_steps, + summary_writer=summary_writer, + scaffold=scaffold)) + if supervisor_save_model_secs > 0: + monitors.append( + monitors_lib.CheckpointSaver( + output_dir, + save_secs=supervisor_save_model_secs, + scaffold=scaffold)) if steps is not None or max_steps is not None: monitors.append(monitors_lib.StopAtStep(steps, max_steps)) diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py index 8c9790b6a6a..d0e9b61f42f 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py @@ -30,6 +30,7 @@ from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops +from tensorflow.python.platform import tf_logging as logging # pylint: disable=g-multiple-import,g-bad-import-order from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels @@ -206,6 +207,13 @@ def _access(data, iloc): return data[iloc] +def _check_dtype(dtype): + if dtypes.as_dtype(dtype) == dtypes.float64: + logging.warn( + 'float64 is not supported by many models, consider casting to float32.') + return dtype + + class DataFeeder(object): """Data feeder is an example class to sample data for TF trainer.""" @@ -215,60 +223,82 @@ class DataFeeder(object): """Initializes a DataFeeder instance. Args: - x: feature Nd numpy matrix of shape [n_samples, n_features, ...]. - y: target vector, either floats for regression or class id for + x: Feature Nd numpy matrix of shape `[n_samples, n_features, ...]`. + y: Target vector, either floats for regression or class id for classification. If matrix, will consider as a sequence - of targets. 
Can be None for unsupervised setting. - n_classes: number of classes, 0 and 1 are considered regression, None will - pass through the input labels without one-hot conversion. - batch_size: mini batch size to accumulate. - random_state: numpy RandomState object to reproduce sampling. + of targets. Can be `None` for unsupervised setting. + n_classes: Number of classes, 0 and 1 are considered regression, `None` + will pass through the input labels without one-hot conversion. + batch_size: Mini-batch size to accumulate. + shuffle: Whether to shuffle `x`. + random_state: Numpy `RandomState` object to reproduce sampling. + epochs: Number of times to iterate over input data before raising + `StopIteration` exception. Attributes: - x: input features. - y: input target. - n_classes: number of classes (if None, pass through indices without + x: Input features. + y: Input target. + n_classes: Number of classes (if `None`, pass through indices without one-hot conversion). - batch_size: mini batch size to accumulate. - input_shape: shape of the input. - output_shape: shape of the output. - input_dtype: dtype of input. - output_dtype: dtype of output. + batch_size: Mini-batch size to accumulate. + input_shape: Shape of the input. + output_shape: Shape of the output. + input_dtype: DType of input. + output_dtype: DType of output. """ - x_dtype = np.int64 if x.dtype == np.int64 else np.float32 + self._x = check_array(x, dtype=x.dtype) + # self.n_classes is None means we're passing in raw target indices. 
y_dtype = ( np.int64 if n_classes is not None and n_classes > 1 else np.float32) - self.x = check_array(x, dtype=x_dtype) - # self.n_classes is None means we're passing in raw target indices if n_classes is not None: - self.y = (None if y is None else check_array(y, dtype=y_dtype)) + self._y = (None if y is None else check_array(y, dtype=y_dtype)) + elif isinstance(y, list): + self._y = np.array(y) else: - self.y = y - if isinstance(self.y, list): - self.y = np.array(y) + self._y = y self.n_classes = n_classes self.max_epochs = epochs self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape( - self.x.shape, None if self.y is None else self.y.shape, n_classes, + self._x.shape, None if self._y is None else self._y.shape, n_classes, batch_size) # Input dtype matches dtype of x. - self.input_dtype = x_dtype + self._input_dtype = _check_dtype(self._x.dtype) # self.n_classes is None means we're passing in raw target indices - if n_classes is not None or y is None: - self.output_dtype = np.float32 + if n_classes is not None or self._y is None: + self._output_dtype = np.float32 else: - self.output_dtype = self.y.dtype - self.shuffle = shuffle + self._output_dtype = _check_dtype(self._y.dtype) + self._shuffle = shuffle self.random_state = np.random.RandomState( 42) if random_state is None else random_state - if self.shuffle: - self.indices = self.random_state.permutation(self.x.shape[0]) + if self._shuffle: + self.indices = self.random_state.permutation(self._x.shape[0]) else: - self.indices = np.array(range(self.x.shape[0])) + self.indices = np.array(range(self._x.shape[0])) self.offset = 0 self.epoch = 0 self._epoch_placeholder = None + @property + def x(self): + return self._x + + @property + def y(self): + return self._y + + @property + def shuffle(self): + return self._shuffle + + @property + def input_dtype(self): + return self._input_dtype + + @property + def output_dtype(self): + return self._output_dtype + @property def batch_size(self): return 
self._batch_size @@ -291,7 +321,7 @@ class DataFeeder(object): """ input_shape = [None] + self.input_shape[1:] self._input_placeholder = array_ops.placeholder( - dtypes.as_dtype(self.input_dtype), + dtypes.as_dtype(self._input_dtype), input_shape, name='input') if self.output_shape is None: @@ -299,7 +329,7 @@ class DataFeeder(object): else: output_shape = [None] + self.output_shape[1:] self._output_placeholder = array_ops.placeholder( - dtypes.as_dtype(self.output_dtype), + dtypes.as_dtype(self._output_dtype), output_shape, name='output') return self._input_placeholder, self._output_placeholder @@ -345,20 +375,20 @@ class DataFeeder(object): feed_dict[self._epoch_placeholder.name] = [self.epoch] # Take next batch of indices. - end = min(self.x.shape[0], self.offset + self._batch_size) + end = min(self._x.shape[0], self.offset + self._batch_size) batch_indices = self.indices[self.offset:end] # Assign input features from random indices. inp = ( - np.array(_access(self.x, batch_indices)).reshape( + np.array(_access(self._x, batch_indices)).reshape( (batch_indices.shape[0], 1)) - if len(self.x.shape) == 1 else _access(self.x, batch_indices)) + if len(self._x.shape) == 1 else _access(self._x, batch_indices)) feed_dict[self._input_placeholder.name] = inp # move offset and reset it if necessary self.offset += self._batch_size - if self.offset >= self.x.shape[0]: - self.indices = self.random_state.permutation(self.x.shape[0]) + if self.offset >= self._x.shape[0]: + self.indices = self.random_state.permutation(self._x.shape[0]) self.offset = 0 self.epoch += 1 @@ -368,21 +398,21 @@ class DataFeeder(object): # assign labels from random indices self.output_shape[0] = batch_indices.shape[0] - out = np.zeros(self.output_shape, dtype=self.output_dtype) + out = np.zeros(self.output_shape, dtype=self._output_dtype) for i in xrange(out.shape[0]): sample = batch_indices[i] # self.n_classes is None means we're passing in raw target indices if self.n_classes is None: - out[i] = 
_access(self.y, sample) + out[i] = _access(self._y, sample) else: if self.n_classes > 1: if len(self.output_shape) == 2: - out.itemset((i, int(_access(self.y, sample))), 1.0) + out.itemset((i, int(_access(self._y, sample))), 1.0) else: - for idx, value in enumerate(_access(self.y, sample)): + for idx, value in enumerate(_access(self._y, sample)): out.itemset(tuple([i, idx, value]), 1.0) else: - out[i] = _access(self.y, sample) + out[i] = _access(self._y, sample) feed_dict[self._output_placeholder.name] = out return feed_dict @@ -420,32 +450,28 @@ class StreamingDataFeeder(DataFeeder): """ # pylint: disable=invalid-name,super-init-not-called x_first_el = six.next(x) - self.x = itertools.chain([x_first_el], x) + self._x = itertools.chain([x_first_el], x) if y is not None: y_first_el = six.next(y) - self.y = itertools.chain([y_first_el], y) + self._y = itertools.chain([y_first_el], y) else: y_first_el = None - self.y = None + self._y = None self.n_classes = n_classes self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape( [1] + list(x_first_el.shape), [1] + list(y_first_el.shape) if y is not None else None, n_classes, batch_size) - self.input_dtype = x_first_el.dtype - # Convert float64 to float32, as all the parameters in the model are - # floats32 and there is a lot of benefits in using it in NNs. - if self.input_dtype == np.float64: - self.input_dtype = np.float32 + self._input_dtype = _check_dtype(x_first_el.dtype) # Output types are floats, due to both softmaxes and regression req. 
if n_classes is not None and n_classes > 0: - self.output_dtype = np.float32 + self._output_dtype = np.float32 elif y is not None: if isinstance(y_first_el, list) or isinstance(y_first_el, np.ndarray): - self.output_dtype = np.dtype(type(y_first_el[0])) + self._output_dtype = _check_dtype(np.dtype(type(y_first_el[0]))) else: - self.output_dtype = np.dtype(type(y_first_el)) + self._output_dtype = _check_dtype(np.dtype(type(y_first_el))) def get_feed_params(self): """Function returns a dict with data feed params while training. @@ -472,22 +498,22 @@ class StreamingDataFeeder(DataFeeder): """ if self.stopped: raise StopIteration - inp = np.zeros(self.input_shape, dtype=self.input_dtype) - if self.y is not None: - out = np.zeros(self.output_shape, dtype=self.output_dtype) + inp = np.zeros(self.input_shape, dtype=self._input_dtype) + if self._y is not None: + out = np.zeros(self.output_shape, dtype=self._output_dtype) for i in xrange(self._batch_size): # Add handling when queue ends. try: - inp[i, :] = six.next(self.x) + inp[i, :] = six.next(self._x) except StopIteration: self.stopped = True inp = inp[:i, :] - if self.y is not None: + if self._y is not None: out = out[:i] break - if self.y is not None: - y = six.next(self.y) + if self._y is not None: + y = six.next(self._y) if self.n_classes is not None and self.n_classes > 1: if len(self.output_shape) == 2: out.itemset((i, y), 1.0) @@ -496,7 +522,7 @@ class StreamingDataFeeder(DataFeeder): out.itemset(tuple([i, idx, value]), 1.0) else: out[i] = y - if self.y is None: + if self._y is None: return {self._input_placeholder.name: inp} return {self._input_placeholder.name: inp, self._output_placeholder.name: out} @@ -511,6 +537,7 @@ class DaskDataFeeder(object): into them. DaskDataFeeder will remove requirement to have full dataset in the memory and still do random seeks for sampling of batches. 
""" + def __init__(self, x, y, n_classes, batch_size, shuffle=True, random_state=None, epochs=None): """Initializes a DaskDataFeeder instance. @@ -521,8 +548,10 @@ class DaskDataFeeder(object): regression values. n_classes: indicator of how many classes the target has. batch_size: Mini batch size to accumulate. + shuffle: Whether to shuffle the inputs. random_state: random state for RNG. Note that it will mutate so use a int value for this if you want consistent sized batches. + epochs: Number of epochs to run. Attributes: x: input features. @@ -537,35 +566,33 @@ class DaskDataFeeder(object): # pylint: disable=invalid-name,super-init-not-called import dask.dataframe as dd # pylint: disable=g-import-not-at-top # TODO(terrytangyuan): check x and y dtypes in dask_io like pandas - self.x = x - self.y = y + self._x = x + self._y = y # save column names - self.x_columns = list(x.columns) + self._x_columns = list(x.columns) if isinstance(y.columns[0], str): - self.y_columns = list(y.columns) + self._y_columns = list(y.columns) else: # deal with cases where two DFs have overlapped default numeric colnames - self.y_columns = len(self.x_columns) + 1 - self.y = self.y.rename(columns={y.columns[0]: self.y_columns}) + self._y_columns = len(self._x_columns) + 1 + self._y = self._y.rename(columns={y.columns[0]: self._y_columns}) # TODO(terrytangyuan): deal with unsupervised cases # combine into a data frame - self.df = dd.multi.concat([self.x, self.y], axis=1) + self.df = dd.multi.concat([self._x, self._y], axis=1) self.n_classes = n_classes x_count = x.count().compute()[0] - x_shape = (x_count, len(self.x.columns)) - y_shape = (x_count, len(self.y.columns)) + x_shape = (x_count, len(self._x.columns)) + y_shape = (x_count, len(self._y.columns)) # TODO(terrytangyuan): Add support for shuffle and epochs. 
- self.shuffle = shuffle + self._shuffle = shuffle self.epochs = epochs self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape( x_shape, y_shape, n_classes, batch_size) self.sample_fraction = self._batch_size / float(x_count) - # TODO(ptucker,ipolosukhin): Remove this? - # TODO(ipolosukhin): remove or restore. - # self.x.dtypes[0], self.y.dtypes[self.y_columns] - self.input_dtype, self.output_dtype = np.float32, np.float32 + self._input_dtype = _check_dtype(self._x.dtypes[0]) + self._output_dtype = _check_dtype(self._y.dtypes[self._y_columns]) if random_state is None: self.random_state = 66 else: @@ -597,17 +624,17 @@ class DaskDataFeeder(object): sample = self.df.random_split( [self.sample_fraction, 1 - self.sample_fraction], random_state=self.random_state) - inp = extract_pandas_matrix(sample[0][self.x_columns].compute()).tolist() - out = extract_pandas_matrix(sample[0][self.y_columns].compute()) + inp = extract_pandas_matrix(sample[0][self._x_columns].compute()).tolist() + out = extract_pandas_matrix(sample[0][self._y_columns].compute()) # convert to correct dtype - inp = np.array(inp, dtype=self.input_dtype) + inp = np.array(inp, dtype=self._input_dtype) # one-hot encode out for each class for cross entropy loss if HAS_PANDAS: import pandas as pd # pylint: disable=g-import-not-at-top if not isinstance(out, pd.Series): out = out.flatten() - out_max = self.y.max().compute().values[0] - encoded_out = np.zeros((out.size, out_max + 1), dtype=self.output_dtype) + out_max = self._y.max().compute().values[0] + encoded_out = np.zeros((out.size, out_max + 1), dtype=self._output_dtype) encoded_out[np.arange(out.size), out] = 1 return {input_placeholder.name: inp, output_placeholder.name: encoded_out} diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py index 1709e428fc2..bf5e62cb4c0 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py +++ 
b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py @@ -20,12 +20,17 @@ from __future__ import division from __future__ import print_function from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import io_ops +from tensorflow.python.ops import logging_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.platform import gfile +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import input as input_ops - +from tensorflow.python.training import queue_runner # Default name for key in the feature dict. KEY_FEATURE_NAME = '__key__' @@ -219,11 +224,18 @@ def read_keyed_batch_examples( return queued_examples_with_keys -def read_keyed_batch_features( - file_pattern, batch_size, features, reader, - randomize_input=True, num_epochs=None, - queue_capacity=10000, reader_num_threads=1, - parser_num_threads=1, name=None): +def read_keyed_batch_features(file_pattern, + batch_size, + features, + reader, + randomize_input=True, + num_epochs=None, + queue_capacity=10000, + reader_num_threads=1, + feature_queue_capacity=100, + num_queue_runners=2, + parser_num_threads=None, + name=None): """Adds operations to read, queue, batch and parse `Example` protos. Given file pattern (or list of files), will setup a queue for file names, @@ -251,7 +263,12 @@ def read_keyed_batch_features( tf.initialize_local_variables() as shown in the tests. queue_capacity: Capacity for input queue. reader_num_threads: The number of threads to read examples. - parser_num_threads: The number of threads to parse examples. + feature_queue_capacity: Capacity of the parsed features queue. 
+ num_queue_runners: Number of queue runners to start for the feature queue, + Adding multiple queue runners for the parsed example queue helps maintain + a full queue when the subsequent computations overall are cheaper than + parsing. + parser_num_threads: (Deprecated) The number of threads to parse examples. name: Name of resulting op. Returns: @@ -261,6 +278,11 @@ def read_keyed_batch_features( Raises: ValueError: for invalid inputs. """ + + if parser_num_threads: + # TODO(sibyl-Aix6ihai): Remove on Sept 3 2016. + logging.warning('parser_num_threads is deprecated, it will be removed on' + 'Sept 3 2016') with ops.op_scope([file_pattern], name, 'read_batch_features') as scope: keys, examples = read_keyed_batch_examples( file_pattern, batch_size, reader, randomize_input=randomize_input, @@ -268,24 +290,66 @@ def read_keyed_batch_features( num_threads=reader_num_threads, read_batch_size=batch_size, name=scope) - if parser_num_threads == 1: - # Avoid queue overhead for single thread - return keys, parsing_ops.parse_example(examples, features) + # Parse the example. + feature_map = parsing_ops.parse_example(examples, features) - # Parse features into tensors in many threads and put on the queue. - features_list = [] - for _ in range(parser_num_threads): - feature_dict = parsing_ops.parse_example(examples, features) - feature_dict[KEY_FEATURE_NAME] = keys - features_list.append(feature_dict) - queued_features = input_ops.batch_join( - features_list, - batch_size=batch_size, - capacity=queue_capacity, - enqueue_many=True, - name='parse_example_batch_join') - queued_keys = queued_features.pop(KEY_FEATURE_NAME) - return queued_keys, queued_features + # Lets also add preprocessed tensors into the queue types for each item of + # the queue. + tensors_to_enqueue = [] + # Each entry contains the key, and a boolean which indicates whether the + # tensor was a sparse tensor. 
+ tensors_mapping = [] + # TODO(sibyl-Aix6ihai): Most of the functionality here is about pushing sparse + # tensors into a queue. This could be taken care in somewhere else so others + # can reuse it. Also, QueueBase maybe extended to handle sparse tensors + # directly. + for key, tensor in feature_map.iteritems(): + if isinstance(tensor, ops.SparseTensor): + tensors_mapping.append((key, True)) + tensors_to_enqueue.extend([tensor.indices, tensor.values, tensor.shape]) + else: + tensors_mapping.append((key, False)) + tensors_to_enqueue.append(tensor) + tensors_to_enqueue.append(keys) + + queue_dtypes = [x.dtype for x in tensors_to_enqueue] + input_queue = data_flow_ops.FIFOQueue(feature_queue_capacity, queue_dtypes) + + # Add a summary op to debug if our feature queue is full or not. + logging_ops.scalar_summary('queue/parsed_features/%s/fraction_of_%d_full' % + (input_queue.name, feature_queue_capacity), + math_ops.cast(input_queue.size(), dtypes.float32) + * (1. / feature_queue_capacity)) + + # Add multiple queue runners so that the queue is always full. Adding more + # than two queue-runners may hog the cpu on the worker to fill up the queue. + for _ in range(num_queue_runners): + queue_runner.add_queue_runner( + queue_runner.QueueRunner(input_queue, [input_queue.enqueue( + tensors_to_enqueue)])) + + dequeued_tensors = input_queue.dequeue() + + # Reset shapes on dequeued tensors. + for i in range(len(tensors_to_enqueue)): + dequeued_tensors[i].set_shape(tensors_to_enqueue[i].get_shape()) + + # Recreate feature mapping according to the original dictionary. + dequeued_feature_map = {} + index = 0 + for key, is_sparse_tensor in tensors_mapping: + if is_sparse_tensor: + # Three tensors are (indices, values, shape). 
+ dequeued_feature_map[key] = ops.SparseTensor( + dequeued_tensors[index], dequeued_tensors[index + 1], + dequeued_tensors[index + 2]) + index += 3 + else: + dequeued_feature_map[key] = dequeued_tensors[index] + index += 1 + dequeued_keys = dequeued_tensors[-1] + + return dequeued_keys, dequeued_feature_map def read_batch_features(file_pattern, batch_size, features, reader, diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py index f11f0a841f1..d15ef13d7eb 100644 --- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py +++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py @@ -124,18 +124,18 @@ class GraphIOTest(tf.test.TestCase): _VALID_FILE_PATTERN, batch_size, features, randomize_input=False, queue_capacity=queue_capacity, reader_num_threads=2, parser_num_threads=2, name=name) - self.assertEqual("%s/parse_example_batch_join:1" % name, + self.assertEqual("%s/fifo_queue_1_Dequeue:0" % name, features["feature"].name) file_name_queue_name = "%s/file_name_queue" % name file_names_name = "%s/input" % file_name_queue_name example_queue_name = "%s/fifo_queue" % name - parse_example_queue_name = "%s/parse_example_batch_join" % name + parse_example_queue_name = "%s/fifo_queue" % name op_nodes = test_util.assert_ops_in_graph({ file_names_name: "Const", file_name_queue_name: "FIFOQueue", "%s/read/TFRecordReader" % name: "TFRecordReader", example_queue_name: "FIFOQueue", - parse_example_queue_name: "QueueDequeueMany", + parse_example_queue_name: "FIFOQueue", name: "QueueDequeueMany" }, g) self.assertAllEqual(_FILE_NAMES, sess.run(["%s:0" % file_names_name])[0]) diff --git a/tensorflow/contrib/learn/python/learn/models.py b/tensorflow/contrib/learn/python/learn/models.py index d48fa20fb4a..3d41e4907b3 100644 --- a/tensorflow/contrib/learn/python/learn/models.py +++ b/tensorflow/contrib/learn/python/learn/models.py @@ -19,10 +19,10 @@ from __future__ import 
absolute_import from __future__ import division from __future__ import print_function +from tensorflow.contrib import rnn as contrib_rnn from tensorflow.contrib.learn.python.learn.ops import autoencoder_ops from tensorflow.contrib.learn.python.learn.ops import dnn_ops from tensorflow.contrib.learn.python.learn.ops import losses_ops -from tensorflow.contrib import rnn as contrib_rnn from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops as array_ops_ @@ -81,6 +81,7 @@ def linear_regression(x, y, init_mean=None, init_stddev=1.0): with vs.variable_scope('linear_regression'): logging_ops.histogram_summary('linear_regression.x', x) logging_ops.histogram_summary('linear_regression.y', y) + dtype = x.dtype.base_dtype y_shape = y.get_shape() if len(y_shape) == 1: output_shape = 1 @@ -88,15 +89,18 @@ def linear_regression(x, y, init_mean=None, init_stddev=1.0): output_shape = y_shape[1] # Set up the requested initialization. 
if init_mean is None: - weights = vs.get_variable('weights', [x.get_shape()[1], output_shape]) - bias = vs.get_variable('bias', [output_shape]) + weights = vs.get_variable( + 'weights', [x.get_shape()[1], output_shape], dtype=dtype) + bias = vs.get_variable('bias', [output_shape], dtype=dtype) else: weights = vs.get_variable('weights', [x.get_shape()[1], output_shape], initializer=init_ops.random_normal_initializer( - init_mean, init_stddev)) + init_mean, init_stddev, dtype=dtype), + dtype=dtype) bias = vs.get_variable('bias', [output_shape], initializer=init_ops.random_normal_initializer( - init_mean, init_stddev)) + init_mean, init_stddev, dtype=dtype), + dtype=dtype) logging_ops.histogram_summary('linear_regression.weights', weights) logging_ops.histogram_summary('linear_regression.bias', bias) return losses_ops.mean_squared_error_regressor(x, y, weights, bias) @@ -135,19 +139,22 @@ def logistic_regression(x, with vs.variable_scope('logistic_regression'): logging_ops.histogram_summary('%s.x' % vs.get_variable_scope().name, x) logging_ops.histogram_summary('%s.y' % vs.get_variable_scope().name, y) + dtype = x.dtype.base_dtype # Set up the requested initialization. 
if init_mean is None: - weights = vs.get_variable('weights', - [x.get_shape()[1], y.get_shape()[-1]]) - bias = vs.get_variable('bias', [y.get_shape()[-1]]) + weights = vs.get_variable( + 'weights', [x.get_shape()[1], y.get_shape()[-1]], dtype=dtype) + bias = vs.get_variable('bias', [y.get_shape()[-1]], dtype=dtype) else: weights = vs.get_variable('weights', [x.get_shape()[1], y.get_shape()[-1]], initializer=init_ops.random_normal_initializer( - init_mean, init_stddev)) + init_mean, init_stddev, dtype=dtype), + dtype=dtype) bias = vs.get_variable('bias', [y.get_shape()[-1]], initializer=init_ops.random_normal_initializer( - init_mean, init_stddev)) + init_mean, init_stddev, dtype=dtype), + dtype=dtype) logging_ops.histogram_summary('%s.weights' % vs.get_variable_scope().name, weights) logging_ops.histogram_summary('%s.bias' % vs.get_variable_scope().name, diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py index dca39386bed..ddf97437423 100644 --- a/tensorflow/contrib/learn/python/learn/monitors.py +++ b/tensorflow/contrib/learn/python/learn/monitors.py @@ -535,8 +535,12 @@ class LoggingTrainable(EveryN): class SummarySaver(EveryN): """Saves summaries every N steps.""" - def __init__(self, summary_op, save_steps=100, output_dir=None, - summary_writer=None): + def __init__(self, + summary_op, + save_steps=100, + output_dir=None, + summary_writer=None, + scaffold=None): """Initializes a `SummarySaver` monitor. Args: @@ -548,6 +552,7 @@ class SummarySaver(EveryN): if no `summary_writer` is supplied. summary_writer: `SummaryWriter`. If `None` and an `output_dir` was passed, one will be created accordingly. + scaffold: `Scaffold` to get summary_op if it's not provided. """ # TODO(ipolosukhin): Implement every N seconds. 
super(SummarySaver, self).__init__(every_n_steps=save_steps) @@ -555,6 +560,7 @@ class SummarySaver(EveryN): self._summary_writer = summary_writer if summary_writer is None and output_dir: self._summary_writer = summary_io.SummaryWriter(output_dir) + self._scaffold = scaffold # TODO(mdan): Throw an error if output_dir and summary_writer are None. def set_estimator(self, estimator): @@ -565,15 +571,18 @@ class SummarySaver(EveryN): def every_n_step_begin(self, step): super(SummarySaver, self).every_n_step_begin(step) + if self._summary_op is None and self._scaffold is not None: + self._summary_op = self._scaffold.summary_op if self._summary_op is not None: return [self._summary_op] return [] def every_n_step_end(self, step, outputs): super(SummarySaver, self).every_n_step_end(step, outputs) - summary_strs = _extract_output(outputs, self._summary_op) - if self._summary_writer and self._summary_op is not None: - self._summary_writer.add_summary(summary_strs, step) + if self._summary_op is not None: + summary_strs = _extract_output(outputs, self._summary_op) + if self._summary_writer: + self._summary_writer.add_summary(summary_strs, step) return False def end(self, session=None): @@ -923,37 +932,89 @@ class ExportMonitor(EveryN): default_batch_size=self._default_batch_size) -class CheckpointSaver(EveryN): +class CheckpointSaver(BaseMonitor): """Saves checkpoints every N steps.""" - def __init__(self, every_n_steps, saver, checkpoint_dir, + def __init__(self, + checkpoint_dir, + save_secs=None, + save_steps=None, + saver=None, checkpoint_basename="model.ckpt", - first_n_steps=-1): + scaffold=None): """Initialize CheckpointSaver monitor. Args: - every_n_steps: `int`, save every N steps. - saver: `Saver` object, used for saving. checkpoint_dir: `str`, base directory for the checkpoint files. + save_secs: `int`, save every N secs. + save_steps: `int`, save every N steps. + saver: `Saver` object, used for saving. 
checkpoint_basename: `str`, base name for the checkpoint files. - first_n_steps: `int`, if positive, save every step during the - first `first_n_steps` steps. + scaffold: `Scaffold`, use to get saver object. + + Raises: + ValueError: If both `save_steps` and `save_secs` are not `None`. + ValueError: If both `save_steps` and `save_secs` are `None`. """ logging.info("Create CheckpointSaver") - super(CheckpointSaver, self).__init__(every_n_steps=every_n_steps, - first_n_steps=first_n_steps) + super(CheckpointSaver, self).__init__() self._saver = saver self._summary_writer = SummaryWriterCache.get(checkpoint_dir) self._save_path = os.path.join(checkpoint_dir, checkpoint_basename) + self._scaffold = scaffold + self._save_secs = save_secs + self._save_steps = save_steps + self._last_saved_time = None + self._last_begin_step = None + self._last_saved_step = None - def every_n_post_step(self, step, session): + if save_steps is None and save_secs is None: + raise ValueError("Either save_steps or save_secs should be provided") + if (save_steps is not None) and (save_secs is not None): + raise ValueError("Can not provide both save_steps and save_secs.") + + def begin(self, max_steps=None): + super(CheckpointSaver, self).begin(max_steps) + self._last_saved_time = None + self._last_begin_step = None + self._last_saved_step = None + + def step_begin(self, step): + super(CheckpointSaver, self).step_begin(step) + self._last_begin_step = step + + def post_step(self, step, session): + super(CheckpointSaver, self).post_step(step, session) + if self._last_saved_time is None: + self._save(step, session) + + if self._save_steps is not None: + if step >= self._last_saved_step + self._save_steps: + self._save(step, session) + + if self._save_secs is not None: + if time.time() >= self._last_saved_time + self._save_secs: + self._save(step, session) + + def end(self, session=None): + super(CheckpointSaver, self).end(session) + self._save(self._last_begin_step, session) + + def _save(self, 
step, session): + """Saves the latest checkpoint.""" + if step == self._last_saved_step: + return logging.info("Saving checkpoints for %d into %s.", step, self._save_path) - self._saver.save(session, self._save_path, global_step=step) - if self._summary_writer: - self._summary_writer.add_session_log( - SessionLog(status=SessionLog.CHECKPOINT, - checkpoint_path=self._save_path), - step) + self._last_saved_time = time.time() + self._last_saved_step = step + if self._saver is None: + self._scaffold.saver.save(session, self._save_path, global_step=step) + else: + self._saver.save(session, self._save_path, global_step=step) + self._summary_writer.add_session_log( + SessionLog( + status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path), + step) class StepCounter(EveryN): diff --git a/tensorflow/contrib/learn/python/learn/supervised_session.py b/tensorflow/contrib/learn/python/learn/supervised_session.py index 768af3cc91f..07d100fefc7 100644 --- a/tensorflow/contrib/learn/python/learn/supervised_session.py +++ b/tensorflow/contrib/learn/python/learn/supervised_session.py @@ -119,47 +119,96 @@ class Scaffold(object): keep_checkpoint_max: Optional parameter to use to construct a saver if none is already there in the graph. """ - if global_step_tensor is None: - global_step_tensor = contrib_variables.get_or_create_global_step() - self.global_step_tensor = global_step_tensor - if init_op is None: - init_op = Scaffold._get_or_default( - ops.GraphKeys.INIT_OP, variables.initialize_all_variables) - self.init_op = init_op - self.init_feed_dict = init_feed_dict + # NOTE(touts): modifying the init function to be passed the scaffold is a # hack to make it easy to find the saver. Is there a better way? 
if init_fn: - self.init_fn = lambda sess: init_fn(self, sess) + self._init_fn = lambda sess: init_fn(self, sess) else: - self.init_fn = None - if ready_op is None: - ready_op = Scaffold._get_or_default( - ops.GraphKeys.READY_OP, variables.report_uninitialized_variables) - self.ready_op = ready_op - if local_init_op is None: - local_init_op = Scaffold._get_or_default( - ops.GraphKeys.LOCAL_INIT_OP, Scaffold._default_local_init_op) - self.local_init_op = local_init_op - if summary_op is None: - summary_op = Scaffold._get_or_default( - ops.GraphKeys.SUMMARY_OP, logging_ops.merge_all_summaries) - self.summary_op = summary_op + self._init_fn = None + + self._global_step_tensor = global_step_tensor + self._init_op = init_op + self._ready_op = ready_op + self._local_init_op = local_init_op + self._summary_op = summary_op + self._saver = saver + self._keep_checkpoint_max = keep_checkpoint_max + self._init_feed_dict = init_feed_dict + + def finalize(self): + """Creates operations if needed and finalizes the graph.""" + if self._global_step_tensor is None: + self._global_step_tensor = contrib_variables.get_or_create_global_step() + if self._init_op is None: + self._init_op = Scaffold._get_or_default( + 'init_op', ops.GraphKeys.INIT_OP, variables.initialize_all_variables) + if self._ready_op is None: + self._ready_op = Scaffold._get_or_default( + 'ready_op', ops.GraphKeys.READY_OP, + variables.report_uninitialized_variables) + if self._local_init_op is None: + self._local_init_op = Scaffold._get_or_default( + 'local_init_op', ops.GraphKeys.LOCAL_INIT_OP, + Scaffold._default_local_init_op) + if self._summary_op is None: + self._summary_op = Scaffold._get_or_default( + 'summary_op', ops.GraphKeys.SUMMARY_OP, + logging_ops.merge_all_summaries) # pylint: disable=g-long-lambda - if saver is None: - saver = Scaffold._get_or_default( + if self._saver is None: + self._saver = Scaffold._get_or_default( + 'saver', ops.GraphKeys.SAVERS, lambda: training_saver.Saver(sharded=True, - 
max_to_keep=keep_checkpoint_max)) + max_to_keep=self._keep_checkpoint_max)) # pylint: enable=g-long-lambda - self.saver = saver ops.get_default_graph().finalize() + @property + def global_step_tensor(self): + return self._global_step_tensor + + @property + def init_fn(self): + return self._init_fn + + @property + def init_op(self): + return self._init_op + + @property + def ready_op(self): + return self._ready_op + + @property + def local_init_op(self): + return self._local_init_op + + @property + def summary_op(self): + return self._summary_op + + @property + def saver(self): + return self._saver + + @property + def init_feed_dict(self): + return self._init_feed_dict + @staticmethod - def _get_or_default(collection_key, default_constructor): + def _get_or_default(arg_name, collection_key, default_constructor): + """Get from cache or create a default operation.""" elements = ops.get_collection(collection_key) if elements: + if len(elements) > 1: + raise RuntimeError('More than one item in the collection "%s". ' + 'Please indicate which one to use by passing it to ' + 'the tf.Scaffold constructor as: ' + 'tf.Scaffold(%s=item to use)', collection_key, + arg_name) return elements[0] op = default_constructor() if op is not None: @@ -202,9 +251,10 @@ class SupervisedSession(object): self._config = config self._monitors = monitors or [] self._scaffold = scaffold or Scaffold() - # Finalize and write the graph. - self._graph.finalize() + for monitor in self._monitors: + monitor.begin(max_steps=None) # Create the session. + self._scaffold.finalize() self._session_manager = sm.SessionManager( local_init_op=self._scaffold.local_init_op, ready_op=self._scaffold.ready_op, @@ -212,8 +262,6 @@ class SupervisedSession(object): self._sess = recoverable_session.RecoverableSession(self._create_session) # Call the begin() method of monitors. 
self._init_step = self._tf_sess.run(self._scaffold.global_step_tensor) - for monitor in self._monitors: - monitor.begin(max_steps=None) # Write the graph out, note: this uses self._init_step. self.write_graph() diff --git a/tensorflow/contrib/learn/python/learn/tests/coordinated_session_test.py b/tensorflow/contrib/learn/python/learn/tests/coordinated_session_test.py index aad9b71d453..72ff75fbfde 100644 --- a/tensorflow/contrib/learn/python/learn/tests/coordinated_session_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/coordinated_session_test.py @@ -76,9 +76,8 @@ class CoordinatedSessionTest(tf.test.TestCase): self.assertFalse(coord_sess.should_stop()) self.assertEqual(0, coord_sess.run(c)) self.assertEqual(1, coord_sess.run(v, feed_dict={c: 1})) - with self.assertRaisesRegexp(tf.errors.InvalidArgumentError, - 'both fed and fetched'): - coord_sess.run(c, feed_dict={c: 2}) + with self.assertRaisesRegexp(TypeError, 'None has invalid type'): + coord_sess.run([None], feed_dict={c: 2}) self.assertTrue(coord.should_stop()) self.assertTrue(coord_sess.should_stop()) @@ -101,9 +100,8 @@ class CoordinatedSessionTest(tf.test.TestCase): self.assertEqual(1, coord_sess.run(v, feed_dict={c: 1})) for t in threads: self.assertTrue(t.is_alive()) - with self.assertRaisesRegexp(tf.errors.InvalidArgumentError, - 'both fed and fetched'): - coord_sess.run(c, feed_dict={c: 2}) + with self.assertRaisesRegexp(TypeError, 'None has invalid type'): + coord_sess.run([None], feed_dict={c: 2}) for t in threads: self.assertFalse(t.is_alive()) self.assertTrue(coord.should_stop()) diff --git a/tensorflow/contrib/learn/python/learn/tests/data_feeder_test.py b/tensorflow/contrib/learn/python/learn/tests/data_feeder_test.py index 89e4186e253..fe675e31229 100644 --- a/tensorflow/contrib/learn/python/learn/tests/data_feeder_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/data_feeder_test.py @@ -20,6 +20,7 @@ from __future__ import print_function import numpy as np import six +from 
six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf # pylint: disable=wildcard-import @@ -31,6 +32,68 @@ class DataFeederTest(tf.test.TestCase): # pylint: disable=undefined-variable """Tests for `DataFeeder`.""" + def _assert_raises(self, input_data): + with self.assertRaisesRegexp(TypeError, 'annot convert'): + data_feeder.DataFeeder(input_data, None, n_classes=0, batch_size=1) + + def test_input_uint32(self): + self._assert_raises(np.matrix([[1, 2], [3, 4]], dtype=np.uint32)) + + def test_input_uint64(self): + self._assert_raises(np.matrix([[1, 2], [3, 4]], dtype=np.uint64)) + + def _assert_dtype(self, expected_np_dtype, expected_tf_dtype, input_data): + feeder = data_feeder.DataFeeder(input_data, None, n_classes=0, batch_size=1) + self.assertEqual(expected_np_dtype, feeder.input_dtype) + with tf.Graph().as_default() as g, self.test_session(g): + inp, _ = feeder.input_builder() + self.assertEqual(expected_tf_dtype, inp.dtype) + + def test_input_int8(self): + self._assert_dtype( + np.int8, tf.int8, np.matrix([[1, 2], [3, 4]], dtype=np.int8)) + + def test_input_int16(self): + self._assert_dtype( + np.int16, tf.int16, np.matrix([[1, 2], [3, 4]], dtype=np.int16)) + + def test_input_int32(self): + self._assert_dtype( + np.int32, tf.int32, np.matrix([[1, 2], [3, 4]], dtype=np.int32)) + + def test_input_int64(self): + self._assert_dtype( + np.int64, tf.int64, np.matrix([[1, 2], [3, 4]], dtype=np.int64)) + + def test_input_uint8(self): + self._assert_dtype( + np.uint8, tf.uint8, np.matrix([[1, 2], [3, 4]], dtype=np.uint8)) + + def test_input_uint16(self): + self._assert_dtype( + np.uint16, tf.uint16, np.matrix([[1, 2], [3, 4]], dtype=np.uint16)) + + def test_input_float16(self): + self._assert_dtype( + np.float16, tf.float16, np.matrix([[1, 2], [3, 4]], dtype=np.float16)) + + def test_input_float32(self): + self._assert_dtype( + np.float32, tf.float32, np.matrix([[1, 2], [3, 4]], dtype=np.float32)) + + def test_input_float64(self): + 
self._assert_dtype( + np.float64, tf.float64, np.matrix([[1, 2], [3, 4]], dtype=np.float64)) + + def test_input_bool(self): + self._assert_dtype( + np.bool, tf.bool, + np.array([[False for _ in xrange(2)] for _ in xrange(2)])) + + def test_input_string(self): + input_data = np.array([['str%d' % i for i in xrange(2)] for _ in xrange(2)]) + self._assert_dtype(input_data.dtype, tf.string, input_data) + def test_unsupervised(self): data = np.matrix([[1, 2], [2, 3], [3, 4]]) feeder = data_feeder.DataFeeder(data, None, n_classes=0, batch_size=1) diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/arithmetic_transform_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/arithmetic_transform_test.py index 2266caeb2f0..045f84580f1 100644 --- a/tensorflow/contrib/learn/python/learn/tests/dataframe/arithmetic_transform_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/arithmetic_transform_test.py @@ -53,5 +53,26 @@ class SumTestCase(tf.test.TestCase): np.testing.assert_array_equal(expected_sum, actual_sum) +class DifferenceTestCase(tf.test.TestCase): + """Test class for `Difference` transform.""" + + def testDifference(self): + if not HAS_PANDAS: + return + num_rows = 100 + + pandas_df = pd.DataFrame({"a": np.arange(num_rows), + "b": np.arange(num_rows, 2 * num_rows)}) + + frame = df.TensorFlowDataFrame.from_pandas( + pandas_df, shuffle=False, batch_size=num_rows) + + frame["a-b"] = frame["a"] - frame["b"] + + expected_diff = pandas_df["a"] - pandas_df["b"] + actual_diff = frame.run_once()["a-b"] + np.testing.assert_array_equal(expected_diff, actual_diff) + + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/compare_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/binary_transform_test.py similarity index 80% rename from tensorflow/contrib/learn/python/learn/tests/dataframe/compare_test.py rename to 
tensorflow/contrib/learn/python/learn/tests/dataframe/binary_transform_test.py index 51c464f73d5..9697feff25f 100644 --- a/tensorflow/contrib/learn/python/learn/tests/dataframe/compare_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/binary_transform_test.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================== -"""Tests for comparison transforms.""" +"""Tests for binary transforms.""" from __future__ import absolute_import from __future__ import division @@ -23,14 +23,15 @@ import numpy as np import tensorflow as tf from tensorflow.contrib.learn.python.learn.dataframe import tensorflow_dataframe as df -from tensorflow.contrib.learn.python.learn.dataframe.transforms.compare import COMPARISON_TRANSFORMS +from tensorflow.contrib.learn.python.learn.dataframe.transforms.binary_transforms import BINARY_TRANSFORMS NUMPY_ARRAY_SIZE = 100 -SCALAR = 50 +SCALAR = 50.0 +TEST_NAME_PREFIX = "testBinaryOp_" -class CompareTestCase(tf.test.TestCase): - """Test class for comparison transforms.""" +class BinaryTransformTestCase(tf.test.TestCase): + """Test class for binary transforms.""" @classmethod def add_test_case(cls, fn_name, op): @@ -69,10 +70,15 @@ class CompareTestCase(tf.test.TestCase): coord.join(threads) np.testing.assert_almost_equal(expected_series, actual_series) np.testing.assert_almost_equal(expected_scalar, actual_scalar) - setattr(cls, "test{}".format(op.__name__), _test) + setattr(cls, "{}{}".format(TEST_NAME_PREFIX, op.__name__), _test) -for ct in COMPARISON_TRANSFORMS: - CompareTestCase.add_test_case(*ct) +for bt in BINARY_TRANSFORMS: + BinaryTransformTestCase.add_test_case(*bt) + +# Check that the number of test methods matches the number of binary transforms. 
+test_methods = [test for test in dir(BinaryTransformTestCase) + if test.startswith(TEST_NAME_PREFIX)] +assert len(test_methods) == len(BINARY_TRANSFORMS) if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/boolean_mask_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/boolean_mask_test.py index b3af36b52cf..1e3a069b6da 100644 --- a/tensorflow/contrib/learn/python/learn/tests/dataframe/boolean_mask_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/boolean_mask_test.py @@ -1,4 +1,4 @@ -# Copyright 2016 Google Inc. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/estimator_utils_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/estimator_utils_test.py index 9fc1360ca32..0aeecc50158 100644 --- a/tensorflow/contrib/learn/python/learn/tests/dataframe/estimator_utils_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/estimator_utils_test.py @@ -1,4 +1,3 @@ -# pylint: disable=g-bad-file-header # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py index 14e283cb791..d7e2fe684b8 100644 --- a/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py @@ -208,12 +208,17 @@ class TensorFlowDataFrameTestCase(tf.test.TestCase): tensorflow_df = df.TensorFlowDataFrame.from_csv( [data_path], batch_size=batch_size, - num_epochs=num_epochs, shuffle=False, default_values=default_values) - actual_num_batches = len(list(tensorflow_df.run())) + result_batches = list(tensorflow_df.run(num_epochs=num_epochs)) + actual_num_batches = len(result_batches) self.assertEqual(expected_num_batches, actual_num_batches) + # TODO(soergel): figure out how to dequeue the final small batch + expected_rows = 1696 # num_epochs * 100 + actual_rows = sum([len(x["int"]) for x in result_batches]) + self.assertEqual(expected_rows, actual_rows) + def testFromCSVWithFeatureSpec(self): if not HAS_PANDAS: return @@ -297,6 +302,53 @@ class TensorFlowDataFrameTestCase(tf.test.TestCase): expected_value = expected_row[ind[1]] np.testing.assert_array_equal(expected_value, val) + def testSplitString(self): + batch_size = 8 + num_epochs = 17 + expected_num_batches = (num_epochs * 100) // batch_size + + data_path = _make_test_csv() + default_values = [0, 0.0, 0, ""] + + tensorflow_df = df.TensorFlowDataFrame.from_csv( + [data_path], + batch_size=batch_size, + shuffle=False, + default_values=default_values) + + a, b = tensorflow_df.split("string", 0.7) # no rebatching + + total_result_batches = list(tensorflow_df.run(num_epochs=num_epochs)) + a_result_batches = list(a.run(num_epochs=num_epochs)) + b_result_batches = list(b.run(num_epochs=num_epochs)) + + self.assertEqual(expected_num_batches, 
len(total_result_batches)) + self.assertEqual(expected_num_batches, len(a_result_batches)) + self.assertEqual(expected_num_batches, len(b_result_batches)) + + total_rows = sum([len(x["int"]) for x in total_result_batches]) + a_total_rows = sum([len(x["int"]) for x in a_result_batches]) + b_total_rows = sum([len(x["int"]) for x in b_result_batches]) + + print("Split rows: %s => %s, %s" % (total_rows, a_total_rows, b_total_rows)) + + # TODO(soergel): figure out how to dequeue the final small batch + expected_total_rows = 1696 # (num_epochs * 100) + + self.assertEqual(expected_total_rows, total_rows) + self.assertEqual(1087, a_total_rows) # stochastic but deterministic + # self.assertEqual(int(total_rows * 0.7), a_total_rows) + self.assertEqual(609, b_total_rows) # stochastic but deterministic + # self.assertEqual(int(total_rows * 0.3), b_total_rows) + + # The strings used for hashing were all unique in the original data, but + # we ran 17 epochs, so each one should appear 17 times. Each copy should + # be hashed into the same partition, so there should be no overlap of the + # keys. + a_strings = set([s for x in a_result_batches for s in x["string"]]) + b_strings = set([s for x in b_result_batches for s in x["string"]]) + self.assertEqual(frozenset(), a_strings & b_strings) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/unary_transform_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/unary_transform_test.py index 05b625ee05d..0c317966af3 100644 --- a/tensorflow/contrib/learn/python/learn/tests/dataframe/unary_transform_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/unary_transform_test.py @@ -1,4 +1,4 @@ -# Copyright 2016 Google Inc. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -34,7 +34,12 @@ class UnaryTestCase(tf.test.TestCase): @classmethod def add_test_case(cls, name, op, np_dtype=float): def _test(self): - arr = np.arange(NUMPY_ARRAY_SIZE, dtype=np_dtype) + if np_dtype == bool: + arr = np.array([True] * int(NUMPY_ARRAY_SIZE/2) + + [False] * int(NUMPY_ARRAY_SIZE/2)) + np.random.shuffle(arr) + else: + arr = np.arange(NUMPY_ARRAY_SIZE, dtype=np_dtype) frame = df.TensorFlowDataFrame.from_numpy(arr, batch_size=NUMPY_ARRAY_SIZE, shuffle=False) diff --git a/tensorflow/contrib/learn/python/learn/tests/experiment_test.py b/tensorflow/contrib/learn/python/learn/tests/experiment_test.py index d44ace1be4a..0ccb7b03ed7 100644 --- a/tensorflow/contrib/learn/python/learn/tests/experiment_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/experiment_test.py @@ -23,7 +23,7 @@ import tensorflow as tf from tensorflow.contrib.learn.python.learn import runner_flags # pylint: disable=unused-import -class TestEstimator(object): +class TestEstimator(tf.contrib.learn.Evaluable, tf.contrib.learn.Trainable): def __init__(self): self.eval_count = 0 diff --git a/tensorflow/contrib/learn/python/learn/tests/graph_actions_test.py b/tensorflow/contrib/learn/python/learn/tests/graph_actions_test.py index 14a0c2c58ea..1acee3c4a32 100644 --- a/tensorflow/contrib/learn/python/learn/tests/graph_actions_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/graph_actions_test.py @@ -207,9 +207,8 @@ class GraphActionsTest(tf.test.TestCase): with tf.Graph().as_default() as g, self.test_session(g): self._assert_ckpt(self._output_dir, False) in0, _, _ = self._build_inference_graph() - with self.assertRaisesRegexp( - tf.errors.InvalidArgumentError, 'both fed and fetched'): - learn.graph_actions.infer(None, {'a': in0}, feed_dict={in0: 4.0}) + with self.assertRaisesRegexp(TypeError, 'Can not convert a NoneType'): + learn.graph_actions.infer(None, {'a': in0}, feed_dict={None: 4.0}) self._assert_ckpt(self._output_dir, False) def test_infer_feed(self): diff --git 
a/tensorflow/contrib/learn/python/learn/tests/load_csv_test.py b/tensorflow/contrib/learn/python/learn/tests/load_csv_test.py new file mode 100644 index 00000000000..88e45abf4e0 --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/tests/load_csv_test.py @@ -0,0 +1,39 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from tensorflow.contrib.learn.python.learn import datasets + + +class LoadCsvTest(tf.test.TestCase): + """Test load csv functions.""" + + def testIris(self): + iris = datasets.load_iris() + self.assertTupleEqual(iris.data.shape, (150, 4)) + self.assertTupleEqual(iris.target.shape, (150,)) + + def testBoston(self): + boston = datasets.load_boston() + self.assertTupleEqual(boston.data.shape, (506, 13)) + self.assertTupleEqual(boston.target.shape, (506,)) + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/contrib/learn/python/learn/tests/monitors_test.py b/tensorflow/contrib/learn/python/learn/tests/monitors_test.py index 29ec17400fc..574e9d13dd3 100644 --- a/tensorflow/contrib/learn/python/learn/tests/monitors_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/monitors_test.py @@ -19,11 +19,16 @@ from __future__ import absolute_import from 
__future__ import division from __future__ import print_function +import shutil +import tempfile +import time + from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf from tensorflow.contrib import testing from tensorflow.contrib.learn.python import learn +from tensorflow.contrib.learn.python.learn import supervised_session from tensorflow.python.platform import tf_logging as logging @@ -328,5 +333,126 @@ class StopAtStepTest(tf.test.TestCase): self.assertTrue(m.step_end(15, None)) +class CheckpointSaverTest(tf.test.TestCase): + + def setUp(self): + self.model_dir = tempfile.mkdtemp() + self.graph = tf.Graph() + with self.graph.as_default(): + self.scaffold = supervised_session.Scaffold() + self.global_step = tf.contrib.framework.get_or_create_global_step() + self.train_op = tf.assign_add(self.global_step, 1) + + def tearDown(self): + shutil.rmtree(self.model_dir, ignore_errors=True) + + def _run(self, monitor, step, train_op, sess): + monitor.step_begin(step) + sess.run(train_op) + monitor.post_step(step, sess) + + def test_raise_in_both_secs_and_steps(self): + with self.assertRaises(ValueError): + learn.monitors.CheckpointSaver( + self.model_dir, save_secs=10, save_steps=20) + + def test_raise_in_none_secs_and_steps(self): + with self.assertRaises(ValueError): + learn.monitors.CheckpointSaver(self.model_dir) + + def test_save_secs_saves_in_first_step(self): + with self.graph.as_default(): + monitor = learn.monitors.CheckpointSaver( + self.model_dir, save_secs=2, scaffold=self.scaffold) + monitor.begin() + self.scaffold.finalize() + with tf.Session() as sess: + sess.run(self.scaffold.init_op) + self._run(monitor, 1, self.train_op, sess) + self.assertEqual(1, tf.contrib.framework.load_variable( + self.model_dir, self.global_step.name)) + + def test_save_secs_saves_periodically(self): + with self.graph.as_default(): + monitor = learn.monitors.CheckpointSaver( + self.model_dir, save_secs=2, scaffold=self.scaffold) + monitor.begin() + 
self.scaffold.finalize() + with tf.Session() as sess: + sess.run(self.scaffold.init_op) + self._run(monitor, 1, self.train_op, sess) + self._run(monitor, 2, self.train_op, sess) + # Not saved + self.assertEqual(1, tf.contrib.framework.load_variable( + self.model_dir, self.global_step.name)) + time.sleep(2.5) + self._run(monitor, 3, self.train_op, sess) + # saved + self.assertEqual(3, tf.contrib.framework.load_variable( + self.model_dir, self.global_step.name)) + self._run(monitor, 4, self.train_op, sess) + self._run(monitor, 5, self.train_op, sess) + # Not saved + self.assertEqual(3, tf.contrib.framework.load_variable( + self.model_dir, self.global_step.name)) + time.sleep(2.5) + self._run(monitor, 6, self.train_op, sess) + # saved + self.assertEqual(6, tf.contrib.framework.load_variable( + self.model_dir, self.global_step.name)) + + def test_save_steps_saves_in_first_step(self): + with self.graph.as_default(): + monitor = learn.monitors.CheckpointSaver( + self.model_dir, save_steps=2, scaffold=self.scaffold) + monitor.begin() + self.scaffold.finalize() + with tf.Session() as sess: + sess.run(self.scaffold.init_op) + self._run(monitor, 1, self.train_op, sess) + self.assertEqual(1, tf.contrib.framework.load_variable( + self.model_dir, self.global_step.name)) + + def test_save_steps_saves_periodically(self): + with self.graph.as_default(): + monitor = learn.monitors.CheckpointSaver( + self.model_dir, save_steps=2, scaffold=self.scaffold) + monitor.begin() + self.scaffold.finalize() + with tf.Session() as sess: + sess.run(self.scaffold.init_op) + self._run(monitor, 1, self.train_op, sess) + self._run(monitor, 2, self.train_op, sess) + # Not saved + self.assertEqual(1, tf.contrib.framework.load_variable( + self.model_dir, self.global_step.name)) + self._run(monitor, 3, self.train_op, sess) + # saved + self.assertEqual(3, tf.contrib.framework.load_variable( + self.model_dir, self.global_step.name)) + self._run(monitor, 4, self.train_op, sess) + # Not saved + 
self.assertEqual(3, tf.contrib.framework.load_variable( + self.model_dir, self.global_step.name)) + self._run(monitor, 5, self.train_op, sess) + # saved + self.assertEqual(5, tf.contrib.framework.load_variable( + self.model_dir, self.global_step.name)) + + def test_save_saves_at_end(self): + with self.graph.as_default(): + monitor = learn.monitors.CheckpointSaver( + self.model_dir, save_secs=2, scaffold=self.scaffold) + monitor.begin() + self.scaffold.finalize() + with tf.Session() as sess: + sess.run(self.scaffold.init_op) + self._run(monitor, 1, self.train_op, sess) + self._run(monitor, 2, self.train_op, sess) + monitor.end(sess) + self.assertEqual(2, tf.contrib.framework.load_variable( + self.model_dir, self.global_step.name)) + + if __name__ == '__main__': tf.test.main() diff --git a/tensorflow/contrib/learn/python/learn/tests/supervised_session_test.py b/tensorflow/contrib/learn/python/learn/tests/supervised_session_test.py index ee03a565074..722333f62f1 100644 --- a/tensorflow/contrib/learn/python/learn/tests/supervised_session_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/supervised_session_test.py @@ -30,9 +30,21 @@ from tensorflow.contrib.learn.python.learn import supervised_session class ScaffoldTest(tf.test.TestCase): """Scaffold tests.""" + def test_nothing_created_before_finalize(self): + with tf.Graph().as_default(): + scaffold = supervised_session.Scaffold() + self.assertEqual(None, scaffold.global_step_tensor) + self.assertEqual(None, scaffold.init_op) + self.assertEqual(None, scaffold.init_feed_dict) + self.assertEqual(None, scaffold.init_fn) + self.assertEqual(None, scaffold.ready_op) + self.assertEqual(None, scaffold.local_init_op) + self.assertEqual(None, scaffold.saver) + def test_defaults_empty_graph(self): with tf.Graph().as_default(): scaffold = supervised_session.Scaffold() + scaffold.finalize() self.assertTrue(isinstance(scaffold.global_step_tensor, tf.Variable)) self.assertTrue(isinstance(scaffold.init_op, tf.Operation)) 
self.assertEqual(None, scaffold.init_feed_dict) @@ -49,7 +61,9 @@ class ScaffoldTest(tf.test.TestCase): def test_caches_values(self): with tf.Graph().as_default(): scaffold1 = supervised_session.Scaffold() + scaffold1.finalize() scaffold2 = supervised_session.Scaffold() + scaffold2.finalize() self.assertEqual(scaffold1.global_step_tensor, scaffold2.global_step_tensor) self.assertEqual(scaffold1.init_op, scaffold2.init_op) @@ -57,6 +71,14 @@ class ScaffoldTest(tf.test.TestCase): self.assertEqual(scaffold1.local_init_op, scaffold2.local_init_op) self.assertEqual(scaffold1.saver, scaffold2.saver) + def test_raise_error_if_more_than_one_cached_item(self): + with tf.Graph().as_default(): + tf.Variable([1]) + tf.add_to_collection(tf.GraphKeys.SAVERS, tf.train.Saver()) + tf.add_to_collection(tf.GraphKeys.SAVERS, tf.train.Saver()) + with self.assertRaisesRegexp(RuntimeError, 'More than one item'): + supervised_session.Scaffold().finalize() + def test_uses_passed_values(self): with tf.Graph().as_default(): scaffold = supervised_session.Scaffold(global_step_tensor=1, @@ -66,6 +88,7 @@ class ScaffoldTest(tf.test.TestCase): ready_op=5, local_init_op=6, saver=7) + scaffold.finalize() self.assertEqual(1, scaffold.global_step_tensor) self.assertEqual(2, scaffold.init_op) self.assertEqual(3, scaffold.init_feed_dict) @@ -76,7 +99,7 @@ class ScaffoldTest(tf.test.TestCase): def test_graph_is_finalized(self): with tf.Graph().as_default(): - supervised_session.Scaffold() + supervised_session.Scaffold().finalize() with self.assertRaisesRegexp(RuntimeError, 'Graph is finalized and cannot be modified'): tf.constant([0]) @@ -206,7 +229,7 @@ class SupervisedSessionTest(tf.test.TestCase): # Use a monitor to save the model every 100 steps. It also saves it at # the end. 
monitors = [tf.contrib.learn.monitors.CheckpointSaver( - 100, scaffold.saver, logdir)] + logdir, save_steps=1, scaffold=scaffold)] with supervised_session.SupervisedSession('', scaffold=scaffold, checkpoint_dir=logdir, monitors=monitors) as session: @@ -254,7 +277,7 @@ class SupervisedSessionTest(tf.test.TestCase): 3, tf.errors.AbortedError(None, None, 'Abort')) # Save after each step. ckpt_monitor = tf.contrib.learn.monitors.CheckpointSaver( - 1, scaffold.saver, logdir) + logdir, save_steps=1, scaffold=scaffold) monitors = [abort_monitor, ckpt_monitor] with supervised_session.SupervisedSession('', scaffold=scaffold, checkpoint_dir=logdir, diff --git a/tensorflow/contrib/learn/python/learn/trainable.py b/tensorflow/contrib/learn/python/learn/trainable.py new file mode 100644 index 00000000000..de82ae6e1d9 --- /dev/null +++ b/tensorflow/contrib/learn/python/learn/trainable.py @@ -0,0 +1,63 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""`Trainable` interface.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc + + +class Trainable(object): + """Interface for objects that are trainable by, e.g., `Experiment`. 
+ """ + __metaclass__ = abc.ABCMeta + + @abc.abstractmethod + def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None, + monitors=None, max_steps=None): + """Trains a model given training data `x` predictions and `y` targets. + + Args: + x: Matrix of shape [n_samples, n_features...]. Can be iterator that + returns arrays of features. The training input samples for fitting the + model. If set, `input_fn` must be `None`. + y: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be + iterator that returns array of targets. The training target values + (class labels in classification, real numbers in regression). If set, + `input_fn` must be `None`. + input_fn: Input function. If set, `x`, `y`, and `batch_size` must be + `None`. + steps: Number of steps for which to train model. If `None`, train forever. + If set, `max_steps` must be `None`. + batch_size: minibatch size to use on the input, defaults to first + dimension of `x`. Must be `None` if `input_fn` is provided. + monitors: List of `BaseMonitor` subclass instances. Used for callbacks + inside the training loop. + max_steps: Number of total steps for which to train model. If `None`, + train forever. If set, `steps` must be `None`. + + Two calls to `fit(steps=100)` means 200 training + iterations. On the other hand, two calls to `fit(max_steps=100)` means + that the second call will not do any iteration since first call did + all 100 steps. + + Returns: + `self`, for chaining. 
+ """ + raise NotImplementedError + diff --git a/tensorflow/contrib/learn/python/learn/utils/checkpoints.py b/tensorflow/contrib/learn/python/learn/utils/checkpoints.py index df0a4707b5c..b0908173d6e 100644 --- a/tensorflow/contrib/learn/python/learn/utils/checkpoints.py +++ b/tensorflow/contrib/learn/python/learn/utils/checkpoints.py @@ -19,246 +19,33 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import six - -from tensorflow.python.ops import gen_io_ops -from tensorflow.python.ops import state_ops -from tensorflow.python.ops import variable_scope as vs -from tensorflow.python.ops import variables -from tensorflow.python.platform import gfile -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.training import saver -from tensorflow.python.training import training as train - - -def _get_checkpoint_filename(filepattern): - """Returns checkpoint filename given directory or specific filepattern.""" - if gfile.IsDirectory(filepattern): - return saver.latest_checkpoint(filepattern) - return filepattern +from tensorflow.contrib.framework import deprecated +from tensorflow.contrib.framework.python.framework import checkpoint_utils +@deprecated('2016-08-22', 'Please use tf.contrib.framework.load_checkpoint ' + 'instead') def load_checkpoint(filepattern): - """Returns CheckpointReader for latest checkpoint. - - Args: - filepattern: Directory with checkpoints file or path to checkpoint. - - Returns: - `CheckpointReader` object. - - Raises: - ValueError: if checkpoint_dir doesn't have 'checkpoint' file or checkpoints. 
- """ - filename = _get_checkpoint_filename(filepattern) - if filename is None: - raise ValueError("Couldn't find 'checkpoint' file or checkpoints in " - "given directory %s" % filepattern) - return train.NewCheckpointReader(filename) + """See `tf.contrib.framework.load_checkpoint`.""" + return checkpoint_utils.load_checkpoint(filepattern) +@deprecated('2016-08-22', 'Please use tf.contrib.framework.load_variable ' + 'instead') def load_variable(checkpoint_dir, name): - """Returns a Tensor with the contents of the given variable in the checkpoint. - - Args: - checkpoint_dir: Directory with checkpoints file or path to checkpoint. - name: Name of the tensor to return. - - Returns: - `Tensor` object. - """ - # TODO(b/29227106): Fix this in the right place and remove this. - if name.endswith(":0"): - name = name[:-2] - reader = load_checkpoint(checkpoint_dir) - return reader.get_tensor(name) + """See `tf.contrib.framework.load_variable`.""" + return checkpoint_utils.load_variable(checkpoint_dir, name) +@deprecated('2016-08-22', 'Please use tf.contrib.framework.list_variables ' + 'instead') def list_variables(checkpoint_dir): - """Returns list of all variables in the latest checkpoint. - - Args: - checkpoint_dir: Directory with checkpoints file or path to checkpoint. - - Returns: - List of tuples `(name, shape)`. - """ - reader = load_checkpoint(checkpoint_dir) - variable_map = reader.get_variable_to_shape_map() - names = sorted(variable_map.keys()) - result = [] - for name in names: - result.append((name, variable_map[name])) - return result - - -# pylint: disable=protected-access -# Currently variable_scope doesn't provide very good APIs to access -# all variables under scope and retrieve and check existing scopes. -# TODO(ipolosukhin): Refactor variable_scope module to provide nicer APIs. 
- - -def _set_checkpoint_initializer(variable, file_pattern, tensor_name, slice_spec, - name="checkpoint_initializer"): - """Sets variable initializer to assign op form value in checkpoint's tensor. - - Args: - variable: `Variable` object. - file_pattern: string, where to load checkpoints from. - tensor_name: Name of the `Tensor` to load from checkpoint reader. - slice_spec: Slice specification for loading partitioned variables. - name: Name of the operation. - """ - base_type = variable.dtype.base_dtype - restore_op = gen_io_ops._restore_slice( - file_pattern, - tensor_name, - slice_spec, - base_type, - preferred_shard=-1, - name=name) - variable._initializer_op = state_ops.assign(variable, restore_op) - - -def _set_variable_or_list_initializer(variable_or_list, file_pattern, - tensor_name): - if isinstance(variable_or_list, (list, tuple)): - # A set of slices. - slice_name = None - for v in variable_or_list: - if slice_name is None: - slice_name = v._save_slice_info.full_name - elif slice_name != v._save_slice_info.full_name: - raise ValueError("Slices must all be from the same tensor: %s != %s" % - (slice_name, v._save_slice_info.full_name)) - _set_checkpoint_initializer(v, file_pattern, tensor_name, - v._save_slice_info.spec) - else: - _set_checkpoint_initializer(variable_or_list, file_pattern, tensor_name, "") + """See `tf.contrib.framework.list_variables`.""" + return checkpoint_utils.list_variables(checkpoint_dir) +@deprecated('2016-08-22', 'Please use tf.contrib.framework.init_from_checkpoint' + ' instead') def init_from_checkpoint(checkpoint_dir, assignment_map): - """Using assingment map initializes current variables with loaded tensors. - - Note: This overrides default initialization ops of specified variables and - redefines dtype. - - Assignment map supports next syntax: - `'scope_name/': 'checkpoint_scope_name/'` - will load all variables in - current `scope_name` from `checkpoint_scope_name` with matching variable - names. 
- `'scope_name/variable_name': 'checkpoint_scope_name/some_other_variable'` - - will initalize `scope_name/variable_name` variable - from `checkpoint_scope_name/some_other_variable`. - `variable: 'scope_varaible_name'` - will initialize given variable with - variable from the checkpoint. - `'scope_name/': '/'` - will load all variables in current `scope_name` from - checkpoint's root (e.g. no scope). - - Supports loading into partitioned variables, which are represented as - '/part_'. - - Example: - ```python - # Create variables. - with tf.variable_scope('test'): - m = tf.get_variable('my_var') - with tf.variable_scope('test2'): - var2 = tf.get_variable('my_var') - ... - # Specify which variables to intialize from checkpoint. - init_from_checkpoint(checkpoint_dir, { - 'test/my_var': 'some_var', - 'test2/', 'some_scope/'}) - ... - # Or use `Variable` objects to identify what to initialize. - init_from_checkpoint(checkpoint_dir, { - var2: 'some_scope/var2', - }) - ... - # Initialize variables as usual. - session.run(tf.get_all_variables()) - ``` - - Args: - checkpoint_dir: Directory with checkpoints file or path to checkpoint. - assignment_map: Dict, where keys are names of current variables - (in default graph) and values are names of the variables - in the checkpoint. - - Raises: - tf.errors.OpError: If missing checkpoints or tensors in checkpoints. - ValueError: If missing variables in current graph. - """ - filepattern = _get_checkpoint_filename(checkpoint_dir) - reader = load_checkpoint(checkpoint_dir) - variable_map = reader.get_variable_to_shape_map() - for current_name, tensor_name in six.iteritems(assignment_map): - scopes = "" - var = None - # Check if this is Variable object. - if isinstance(current_name, variables.Variable): - var = current_name - else: - var_scope = vs._get_default_variable_store() - # Check if this is variable in var_store. - var = var_scope._vars.get(current_name, None) - # Also check if variable is partitioned as list. 
- if var is None: - if current_name + "/part_0" in var_scope._vars: - var = [] - i = 0 - while current_name + "/part_%d" % i in var_scope._vars: - var.append(var_scope._vars[current_name + "/part_%d" % i]) - i += 1 - if var is not None: - # If 1 to 1 mapping was provided, find variable in the scope. - if tensor_name not in variable_map: - raise ValueError("Tensor %s is not found in %s checkpoint" % ( - tensor_name, checkpoint_dir - )) - if isinstance(var, variables.Variable): - # Additional at-call-time checks. - if not var.get_shape().is_compatible_with(variable_map[tensor_name]): - raise ValueError( - "Shape of variable %s (%s) doesn't match with shape of " - "tensor %s (%s) from checkpoint reader." % ( - var.name, str(var.get_shape()), - tensor_name, str(variable_map[tensor_name]) - )) - _set_variable_or_list_initializer(var, filepattern, tensor_name) - logging.info("Initialize variable %s from checkpoint %s with %s" % ( - current_name, checkpoint_dir, tensor_name - )) - else: - if "/" in current_name: - scopes = current_name[:current_name.rindex("/")] - current_name = current_name[current_name.rindex("/") + 1:] - if not tensor_name.endswith("/"): - raise ValueError( - "Assignment map with scope only name (%s) " - "should map to scope only (%s). " - "Should be 'scope/': 'other_scope/'." % ( - scopes, tensor_name - )) - # If scope to scope mapping was provided, find all variables in the scope. - for var_name in var_scope._vars: - if var_name.startswith(scopes): - # Lookup name with specified prefix and suffix from current variable. - # If tensor_name given is '/' (root), don't use it for full name. 
- if tensor_name != "/": - full_tensor_name = tensor_name + var_name[len(scopes) + 1:] - else: - full_tensor_name = var_name[len(scopes) + 1:] - if full_tensor_name not in variable_map: - raise ValueError( - "Tensor %s (%s in %s) is not found in %s checkpoint" % ( - full_tensor_name, var_name[len(scopes) + 1:], tensor_name, - checkpoint_dir - )) - var = var_scope._vars[var_name] - _set_variable_or_list_initializer(var, filepattern, full_tensor_name) - logging.info("Initialize variable %s from checkpoint %s with %s" % ( - var_name, checkpoint_dir, tensor_name - )) -# pylint: enable=protected-access + """See `tf.contrib.framework.init_from_checkpoint`.""" + checkpoint_utils.init_from_checkpoint(checkpoint_dir, assignment_map) diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD index bee04e5c51c..6035b55d0a2 100644 --- a/tensorflow/contrib/linear_optimizer/BUILD +++ b/tensorflow/contrib/linear_optimizer/BUILD @@ -59,10 +59,9 @@ py_library( py_test( name = "sdca_ops_test", - size = "small", + size = "medium", srcs = ["python/kernel_tests/sdca_ops_test.py"], srcs_version = "PY2AND3", - tags = ["notap"], deps = [ ":sdca_ops_py", "//tensorflow:tensorflow_py", diff --git a/tensorflow/contrib/linear_optimizer/kernels/sdca_ops.cc b/tensorflow/contrib/linear_optimizer/kernels/sdca_ops.cc index 8f01552defc..49d0cdd98c6 100644 --- a/tensorflow/contrib/linear_optimizer/kernels/sdca_ops.cc +++ b/tensorflow/contrib/linear_optimizer/kernels/sdca_ops.cc @@ -61,8 +61,9 @@ using UnalignedInt64Vector = TTypes::UnalignedConstVec; // Statistics computed with input (ModelWeights, Example). struct ExampleStatistics { - // feature_weights dot feature_values for the example + // feature_weights dot feature_values for the example. double wx = 0; + // sum of squared feature values occurring in the example divided by // L2 * sum(example_weights). 
double normalized_squared_norm = 0; @@ -76,21 +77,26 @@ class Regularizations { Status Initialize(OpKernelConstruction* const context) { TF_RETURN_IF_ERROR(context->GetAttr("l1", &symmetric_l1_)); TF_RETURN_IF_ERROR(context->GetAttr("l2", &symmetric_l2_)); - shrinkage_factor_ = symmetric_l1_ / symmetric_l2_; + shrinkage_ = symmetric_l1_ / symmetric_l2_; return Status::OK(); } // Proximal SDCA shrinking for L1 regularization. double Shrink(const double weight) const { - const double shrink_weight = - std::max(std::abs(weight) - shrinkage_factor_, 0.0); - if (shrink_weight > 0.0) { - return std::copysign(shrink_weight, weight); + const double shrinked = std::max(std::abs(weight) - shrinkage_, 0.0); + if (shrinked > 0.0) { + return std::copysign(shrinked, weight); } return 0.0; } - float shrinkage_factor() const { return shrinkage_factor_; } + // Vectorized float variant of the above. + Eigen::Tensor EigenShrink( + const Eigen::Tensor weights) const { + // Proximal step on the weights which is sign(w)*|w - shrinkage|+. + return weights.sign() * ((weights.abs() - weights.constant(shrinkage_)) + .cwiseMax(weights.constant(0.0))); + } float symmetric_l2() const { return symmetric_l2_; } @@ -98,42 +104,29 @@ class Regularizations { float symmetric_l1_ = 0; float symmetric_l2_ = 0; - // L1 divided by L2, precomputed for use during weight shrinking. - double shrinkage_factor_ = 0; + // L1 divided by L2, pre-computed for use during weight shrinking. + double shrinkage_ = 0; TF_DISALLOW_COPY_AND_ASSIGN(Regularizations); }; -// A dense vector which is a row-slice of the underlying matrix. -struct DenseVector { - // Returns a row slice from the matrix. - inline Eigen::TensorMap> row() - const { - // TensorMap to a row slice of the matrix. 
- return Eigen::TensorMap>( - data_matrix.data() + row_index * data_matrix.dimension(1), - data_matrix.dimension(1)); - } - - const TTypes::ConstMatrix data_matrix; - const int row_index; -}; - class ModelWeights; // Struct describing a single example. class Example { public: - float example_label() const { return example_label_; } - float example_weight() const { return example_weight_; } - double squared_norm() const { return squared_norm_; } - // Compute dot product between weights, and example feature values. This // method also computes the normalized example norm used in SDCA update. const ExampleStatistics ComputeWxAndWeightedExampleNorm( - const int num_partitions, const ModelWeights& weights, + const int num_partitions, const ModelWeights& model_weights, const Regularizations& regularization) const; + float example_label() const { return example_label_; } + + float example_weight() const { return example_weight_; } + + double squared_norm() const { return squared_norm_; } + private: // Sparse features associated with the example. // Indices and Values are the associated feature index, and values. Values @@ -144,7 +137,23 @@ class Example { std::unique_ptr values; // nullptr encodes optional. }; std::vector sparse_features_; - std::vector> dense_values_; + + // A dense vector which is a row-slice of the underlying matrix. + struct DenseVector { + // Returns a row slice from the matrix. + Eigen::TensorMap> row() + const { + // TensorMap to a row slice of the matrix. + return Eigen::TensorMap>( + data_matrix.data() + row_index * data_matrix.dimension(1), + data_matrix.dimension(1)); + } + + const TTypes::ConstMatrix data_matrix; + const int64 row_index; + }; + std::vector> dense_vectors_; + float example_label_ = 0; float example_weight_ = 0; double squared_norm_ = 0; // sum squared norm of the features. 
@@ -162,28 +171,32 @@ class ModelWeights { public: ModelWeights() {} + // Go through all the features present in the example, and update the + // weights based on the dual delta. void UpdateDeltaWeights(const Eigen::ThreadPoolDevice& device, - const Example& example, const double dual_delta, - const Regularizations& regularization) { - // Go through all the features present in the example, and update the - // weights based on the dual delta. - for (int j = 0; j < sparse_weights_.size(); ++j) { + const Example& example, + const double normalized_bounded_dual_delta) { + // Sparse weights. + for (size_t j = 0; j < sparse_weights_.size(); ++j) { const Example::SparseFeatures& sparse_features = example.sparse_features_[j]; - for (int k = 0; k < sparse_features.indices->size(); ++k) { - double delta_w = dual_delta / regularization.symmetric_l2(); - if (sparse_features.values) { - delta_w *= (*sparse_features.values)(k); - } - sparse_delta_weights_[j]((*sparse_features.indices)(k)) += delta_w; + FeatureWeights* const feature_weights = &sparse_weights_[j]; + for (int64 k = 0; k < sparse_features.indices->size(); ++k) { + const double feature_value = sparse_features.values == nullptr + ? 1.0 + : (*sparse_features.values)(k); + feature_weights->deltas((*sparse_features.indices)(k)) += + feature_value * normalized_bounded_dual_delta; } } - for (int j = 0; j < dense_weights_.size(); ++j) { - TTypes::Vec w = dense_delta_weights_[j]; - w.device(device) = - w + - (example.dense_values_[j]->row()) * - w.constant(dual_delta / regularization.symmetric_l2()); + + // Dense weights. + for (size_t j = 0; j < dense_weights_.size(); ++j) { + const Example::DenseVector& dense_vector = *example.dense_vectors_[j]; + TTypes::Vec deltas = dense_weights_[j].deltas; + deltas.device(device) = + deltas + + dense_vector.row() * deltas.constant(normalized_bounded_dual_delta); } } @@ -206,23 +219,22 @@ class ModelWeights { // Reads in the weights, and allocates and initializes the delta weights. 
const auto intialize_weights = [&]( const OpInputList& weight_inputs, OpOutputList* const weight_outputs, - std::vector::Vec>* const weights, - std::vector::Vec>* const delta_weights) { - + std::vector* const feature_weights) { for (int i = 0; i < weight_inputs.size(); ++i) { - weights->push_back(weight_inputs[i].flat()); Tensor* delta_t; weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t); - auto delta_vec = delta_t->flat(); - delta_vec.setZero(); - delta_weights->push_back(delta_vec); + auto deltas = delta_t->flat(); + deltas.setZero(); + feature_weights->emplace_back( + FeatureWeights{weight_inputs[i].flat(), deltas}); } }; intialize_weights(sparse_weights_inputs, &sparse_weights_outputs, - &sparse_weights_, &sparse_delta_weights_); + &sparse_weights_); intialize_weights(dense_weights_inputs, &dense_weights_outputs, - &dense_weights_, &dense_delta_weights_); + &dense_weights_); + return Status::OK(); } @@ -230,11 +242,18 @@ class ModelWeights { // TODO(sibyl-Aix6ihai): Refactor this to support both small-batch mode, and large // batch mode, where we use sparse storage (hashmap) vs dense storage // (vectors). - // Weights for each of the feature groups. - std::vector::Vec> sparse_weights_; - std::vector::Vec> sparse_delta_weights_; - std::vector::Vec> dense_weights_; - std::vector::Vec> dense_delta_weights_; + + // Weights relate to a feature group. + struct FeatureWeights { + // The nominal value of the weight for a feature (indexed by its id). + TTypes::Vec nominals; + + // The accumulated delta weight for a feature (indexed by its id). + TTypes::Vec deltas; + }; + + std::vector sparse_weights_; + std::vector dense_weights_; // Example requires ModelWeights to compute the ExampleStatistics. 
friend class Example; @@ -243,41 +262,48 @@ class ModelWeights { }; const ExampleStatistics Example::ComputeWxAndWeightedExampleNorm( - const int num_partitions, const ModelWeights& weights, + const int num_partitions, const ModelWeights& model_weights, const Regularizations& regularization) const { ExampleStatistics result; + result.normalized_squared_norm = squared_norm_ / regularization.symmetric_l2(); - const int num_sparse_features = weights.sparse_weights_.size(); // Compute the w \dot x. - for (int j = 0; j < num_sparse_features; ++j) { + + // Sparse features contribution. + for (size_t j = 0; j < sparse_features_.size(); ++j) { const Example::SparseFeatures& sparse_features = sparse_features_[j]; - const int num_features = sparse_features.indices->size(); - for (int k = 0; k < num_features; ++k) { - const int feature_index = (*sparse_features.indices)(k); - const float w = regularization.Shrink( - (weights.sparse_weights_[j](feature_index) + - num_partitions * weights.sparse_delta_weights_[j](feature_index))); - if (sparse_features.values) { - result.wx += (*sparse_features.values)(k)*w; - } else { - result.wx += w; - } + const ModelWeights::FeatureWeights& sparse_weights = + model_weights.sparse_weights_[j]; + + for (int64 k = 0; k < sparse_features.indices->size(); ++k) { + const int64 feature_index = (*sparse_features.indices)(k); + const double feature_value = sparse_features.values == nullptr + ? 1.0 + : (*sparse_features.values)(k); + const double feature_weight = + sparse_weights.nominals(feature_index) + + sparse_weights.deltas(feature_index) * num_partitions; + result.wx += feature_value * regularization.Shrink(feature_weight); } } - for (int j = 0; j < weights.dense_weights_.size(); ++j) { - auto w = (weights.dense_weights_[j] + - weights.dense_delta_weights_[j] * - weights.dense_delta_weights_[j].constant(num_partitions)); + // Dense features contribution. 
+ for (size_t j = 0; j < dense_vectors_.size(); ++j) { + const Example::DenseVector& dense_vector = *dense_vectors_[j]; + const ModelWeights::FeatureWeights& dense_weights = + model_weights.dense_weights_[j]; + + const Eigen::Tensor feature_weights = + dense_weights.nominals + + dense_weights.deltas * dense_weights.deltas.constant(num_partitions); const Eigen::Tensor prediction = - ((dense_values_[j]->row()) * - (w.sign() * ((w.abs() - w.constant(regularization.shrinkage_factor())) - .cwiseMax(w.constant(0.0))))) + (dense_vector.row() * regularization.EigenShrink(feature_weights)) .sum(); result.wx += prediction(); } + return result; } @@ -286,13 +312,14 @@ class Examples { public: Examples() {} - // Returns features for example at |example_index|. + // Returns the Example at |example_index|. const Example& example(const int example_index) const { return examples_.at(example_index); } int num_examples() const { return examples_.size(); } - int num_columns() const { return num_columns_; } + + int num_features() const { return num_features_; } // Initialize() must be called immediately after construction. // TODO(sibyl-Aix6ihai): Refactor/shorten this function. 
@@ -300,7 +327,8 @@ class Examples { const int num_sparse_features, const int num_sparse_features_with_values, const int num_dense_features) { - num_columns_ = num_sparse_features + num_dense_features; + num_features_ = num_sparse_features + num_dense_features; + OpInputList sparse_example_indices_inputs; TF_RETURN_IF_ERROR(context->input_list("sparse_example_indices", &sparse_example_indices_inputs)); @@ -329,9 +357,9 @@ class Examples { examples_.clear(); examples_.resize(num_examples); for (int example_id = 0; example_id < num_examples; ++example_id) { - Example* example = &examples_[example_id]; + Example* const example = &examples_[example_id]; example->sparse_features_.resize(num_sparse_features); - example->dense_values_.resize(num_dense_features); + example->dense_vectors_.resize(num_dense_features); example->example_weight_ = example_weights(example_id); example->example_label_ = example_labels(example_id); } @@ -359,7 +387,7 @@ class Examples { } if (start_id < example_indices.size() && example_indices(start_id) == example_id) { - Example::SparseFeatures* sparse_features = + Example::SparseFeatures* const sparse_features = &examples_[example_id].sparse_features_[i]; sparse_features->indices.reset(new UnalignedInt64Vector( &(feature_indices(start_id)), end_id - start_id)); @@ -370,7 +398,7 @@ class Examples { &(feature_weights(start_id)), end_id - start_id)); } } else { - Example::SparseFeatures* sparse_features = + Example::SparseFeatures* const sparse_features = &examples_[example_id].sparse_features_[i]; // Add a Tensor that has size 0. sparse_features->indices.reset( @@ -396,8 +424,8 @@ class Examples { Shard(worker_threads.num_threads, worker_threads.workers, num_sparse_features, num_examples, parse_partition); } - // Parse dense. - { + + { // Parse dense. auto parse_partition = [&](const int64 begin, const int64 end) { // The static_cast here is safe since begin and end can be at most // num_examples which is an int. 
@@ -405,8 +433,8 @@ class Examples { auto dense_features = dense_features_inputs[i].template matrix(); for (int example_id = 0; example_id < num_examples; ++example_id) { - examples_[example_id].dense_values_[i].reset( - new DenseVector{dense_features, example_id}); + examples_[example_id].dense_vectors_[i].reset( + new Example::DenseVector{dense_features, example_id}); } } }; @@ -416,16 +444,17 @@ class Examples { Shard(worker_threads.num_threads, worker_threads.workers, num_dense_features, kCostPerUnit, parse_partition); } - // Compute norm of examples. - { + + { // Compute norm of examples. auto compute_example_norm = [&](const int64 begin, const int64 end) { // The static_cast here is safe since begin and end can be at most // num_examples which is an int. - for (int i = static_cast(begin); i < end; ++i) { + for (int example_id = static_cast(begin); example_id < end; + ++example_id) { double squared_norm = 0; for (int j = 0; j < num_sparse_features; ++j) { const Example::SparseFeatures& sparse_features = - examples_[i].sparse_features_[j]; + examples_[example_id].sparse_features_[j]; if (sparse_features.values) { const Eigen::Tensor sn = sparse_features.values->square().sum(); @@ -436,10 +465,10 @@ class Examples { } for (int j = 0; j < num_dense_features; ++j) { const Eigen::Tensor sn = - examples_[i].dense_values_[j]->row().square().sum(); + examples_[example_id].dense_vectors_[j]->row().square().sum(); squared_norm += sn(); } - examples_[i].squared_norm_ = squared_norm; + examples_[example_id].squared_norm_ = squared_norm; } }; // TODO(sibyl-Aix6ihai): Compute the cost optimally. @@ -455,7 +484,8 @@ class Examples { private: // All examples in the batch. 
std::vector examples_; - int num_columns_; + + int num_features_ = 0; TF_DISALLOW_COPY_AND_ASSIGN(Examples); }; @@ -478,7 +508,6 @@ class DistributedSdcaLargeBatchSolver : public OpKernel { OP_REQUIRES(context, false, errors::InvalidArgument( "Unsupported loss type: ", loss_type)); } - OP_REQUIRES_OK(context, context->GetAttr("num_sparse_features", &num_sparse_features_)); OP_REQUIRES_OK(context, @@ -558,9 +587,11 @@ class DistributedSdcaLargeBatchSolver : public OpKernel { primal_loss, dual_loss); // Compute new weights. - const double bounded_dual_delta = (new_dual - dual) * example_weight; + const double normalized_bounded_dual_delta = + (new_dual - dual) * example_weight / + regularizations_.symmetric_l2(); model_weights.UpdateDeltaWeights(context->eigen_cpu_device(), example, - bounded_dual_delta, regularizations_); + normalized_bounded_dual_delta); // Update example data. example_state_data(example_index, 0) = new_dual; @@ -571,7 +602,8 @@ class DistributedSdcaLargeBatchSolver : public OpKernel { }; // TODO(sibyl-Aix6ihai): Tune this properly based on sparsity of the data, // number of cpus, and cost per example. - const int64 kCostPerUnit = examples.num_examples() * examples.num_columns(); + const int64 kCostPerUnit = + examples.num_examples() * examples.num_features(); const DeviceBase::CpuWorkerThreads& worker_threads = *context->device()->tensorflow_cpu_worker_threads(); Shard(worker_threads.num_threads, worker_threads.workers, @@ -584,11 +616,11 @@ class DistributedSdcaLargeBatchSolver : public OpKernel { // template the entire class to avoid the virtual table lookup penalty in // the inner loop. 
std::unique_ptr loss_updater_; - int num_sparse_features_; - int num_sparse_features_with_values_; - int num_dense_features_; - int num_inner_iterations_; - int num_partitions_; + int num_sparse_features_ = 0; + int num_sparse_features_with_values_ = 0; + int num_dense_features_ = 0; + int num_inner_iterations_ = 0; + int num_partitions_ = 0; Regularizations regularizations_; }; REGISTER_KERNEL_BUILDER( @@ -612,15 +644,14 @@ class SdcaShrinkL1 : public OpKernel { &dense_weights_inputs)); auto shrink_l1 = [&](OpMutableInputList* const inputs) { + // TODO(sibyl-Mooth6ku): Maybe parallelize this. for (int i = 0; i < inputs->size(); ++i) { auto prox_w = inputs->at(i, /*lock_held=*/true).flat(); prox_w.device(context->eigen_cpu_device()) = - prox_w.sign() * - ((prox_w.abs() - - prox_w.constant(regularizations_.shrinkage_factor())) - .cwiseMax(prox_w.constant(0.0))); + regularizations_.EigenShrink(prox_w); } }; + // Shrink both sparse, and dense weights. shrink_l1(&sparse_weights_inputs); shrink_l1(&dense_weights_inputs); diff --git a/tensorflow/contrib/losses/python/losses/__init__.py b/tensorflow/contrib/losses/python/losses/__init__.py index 081d47e4b55..d8181632bf8 100644 --- a/tensorflow/contrib/losses/python/losses/__init__.py +++ b/tensorflow/contrib/losses/python/losses/__init__.py @@ -106,6 +106,7 @@ weighted average over the individual prediction errors: @@absolute_difference @@add_loss +@@hinge_loss @@cosine_distance @@get_losses @@get_regularization_losses diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py index 99aab8b44c2..597e6aeda93 100644 --- a/tensorflow/contrib/losses/python/losses/loss_ops.py +++ b/tensorflow/contrib/losses/python/losses/loss_ops.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn +from tensorflow.python.ops import nn_ops __all__ = 
["absolute_difference", @@ -33,6 +34,7 @@ __all__ = ["absolute_difference", "get_losses", "get_regularization_losses", "get_total_loss", + "hinge_loss", "log_loss", "sigmoid_cross_entropy", "softmax_cross_entropy", @@ -410,6 +412,31 @@ def log_loss(predictions, targets, weight=1.0, epsilon=1e-7, scope=None): return _compute_weighted_loss(losses, weight) +def hinge_loss(logits, target, scope=None): + """Method that returns the loss tensor for hinge loss. + + Args: + logits: The logits, a float tensor. + target: The ground truth output tensor. Its shape should match the shape of + logits. The values of the tensor are expected to be 0.0 or 1.0. + scope: The scope for the operations performed in computing the loss. + + Returns: + A `Tensor` of same shape as logits and target representing the loss values + across the batch. + + Raises: + ValueError: If the shapes of `logits` and `target` don't match. + """ + with ops.op_scope([logits, target], scope, "hinge_loss") as scope: + logits.get_shape().assert_is_compatible_with(target.get_shape()) + # We first need to convert binary labels to -1/1 labels (as floats). + target = math_ops.to_float(target) + all_ones = array_ops.ones_like(target) + labels = math_ops.sub(2 * target, all_ones) + return nn_ops.relu(math_ops.sub(all_ones, math_ops.mul(labels, logits))) + + def sum_of_squares(predictions, targets, weight=1.0, scope=None): """Adds a Sum-of-Squares loss to the training procedure. 
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops_test.py b/tensorflow/contrib/losses/python/losses/loss_ops_test.py index 49460ec2279..824c24451be 100644 --- a/tensorflow/contrib/losses/python/losses/loss_ops_test.py +++ b/tensorflow/contrib/losses/python/losses/loss_ops_test.py @@ -499,6 +499,42 @@ class LogLossTest(tf.test.TestCase): self.assertAlmostEqual(0.0, loss.eval(), 3) +class HingeLossTest(tf.test.TestCase): + + def testIncompatibleShapes(self): + with self.test_session(): + logits = tf.constant([[-1.0], [2.1]]) + target = tf.constant([0.0, 1.0]) + with self.assertRaises(ValueError): + _ = tf.contrib.losses.hinge_loss(logits, target).eval() + + def testAllOutsideMargin(self): + with self.test_session(): + logits = tf.constant([1.2, -1.4, -1.0, 2.1]) + target = tf.constant([1.0, 0.0, 0.0, 1.0]) + loss = tf.contrib.losses.hinge_loss(logits, target) + self.assertAllClose(loss.eval(), [0.0, 0.0, 0.0, 0.0], atol=1e-3) + + def testSomeInsideMargin(self): + with self.test_session(): + logits = tf.constant([[-0.7], [-1.4], [1.4], [0.6]]) + target = tf.constant([[0.0], [0.0], [1.0], [1.0]]) + loss = tf.contrib.losses.hinge_loss(logits, target) + # Examples 1 and 4 are on the correct side of the hyperplane but within + # the margin so they incur some (small) loss. + self.assertAllClose(loss.eval(), [[0.3], [0.0], [0.0], [0.4]], atol=1e-3) + + def testSomeMisclassified(self): + with self.test_session(): + logits = tf.constant([[[1.2], [0.4], [-1.0], [-1.1]]]) + target = tf.constant([[[1.0], [0.0], [0.0], [1.0]]]) + loss = tf.contrib.losses.hinge_loss(logits, target) + # Examples 2 and 4 are on the wrong side of the hyperplane so they incur + # some (fairly large) loss. 
+ self.assertAllClose( + loss.eval(), [[[0.0], [1.4], [0.0], [2.1]]], atol=1e-3) + + class SumOfSquaresLossTest(tf.test.TestCase): def setUp(self): diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile index bad3e07d72e..4987e9bcd40 100644 --- a/tensorflow/contrib/makefile/Makefile +++ b/tensorflow/contrib/makefile/Makefile @@ -393,10 +393,8 @@ $(wildcard tensorflow/core/graph/dot.*) \ $(wildcard tensorflow/core/lib/gif/*) \ $(wildcard tensorflow/core/lib/jpeg/*) \ $(wildcard tensorflow/core/lib/png/*) \ -$(wildcard tensorflow/core/util/checkpoint_reader.*) \ $(wildcard tensorflow/core/util/events_writer.*) \ $(wildcard tensorflow/core/util/reporter.*) \ -$(wildcard tensorflow/core/util/tf_status_helper.*) \ $(wildcard tensorflow/core/platform/default/stream_executor.*) \ $(wildcard tensorflow/core/platform/default/test_benchmark.*) \ $(wildcard tensorflow/core/platform/cuda.h) \ diff --git a/tensorflow/contrib/metrics/ops/set_ops.cc b/tensorflow/contrib/metrics/ops/set_ops.cc index 72eb352a460..0db12d05a7c 100644 --- a/tensorflow/contrib/metrics/ops/set_ops.cc +++ b/tensorflow/contrib/metrics/ops/set_ops.cc @@ -13,10 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" namespace tensorflow { +using shape_inference::InferenceContext; + REGISTER_OP("SetSize") .Input("set_indices: int64") .Input("set_values: T") @@ -24,6 +28,7 @@ REGISTER_OP("SetSize") .Attr("validate_indices: bool = true") .Attr("T: {int8, int16, int32, int64, uint8, uint16, string}") .Output("size: int32") + .SetShapeFn(shape_inference::UnknownShape) .Doc(R"doc( Number of unique elements along last dimension of input `set`. 
@@ -51,6 +56,12 @@ REGISTER_OP("DenseToDenseSetOperation") .Output("result_indices: int64") .Output("result_values: T") .Output("result_shape: int64") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Matrix(c->Dim(c->input(0), 0), 2)); + c->set_output(1, c->Vector(c->UnknownDim())); + c->set_output(2, c->Vector(c->UnknownDim())); + return Status::OK(); + }) .Doc(R"doc( Applies set operation along last dimension of 2 `Tensor` inputs. @@ -84,6 +95,12 @@ REGISTER_OP("DenseToSparseSetOperation") .Output("result_indices: int64") .Output("result_values: T") .Output("result_shape: int64") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Matrix(c->Dim(c->input(0), 0), 2)); + c->set_output(1, c->Vector(c->UnknownDim())); + c->set_output(2, c->Vector(c->UnknownDim())); + return Status::OK(); + }) .Doc(R"doc( Applies set operation along last dimension of `Tensor` and `SparseTensor`. @@ -132,6 +149,12 @@ REGISTER_OP("SparseToSparseSetOperation") .Output("result_indices: int64") .Output("result_values: T") .Output("result_shape: int64") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Matrix(c->UnknownDim(), 2)); + c->set_output(1, c->Vector(c->UnknownDim())); + c->set_output(2, c->Vector(c->UnknownDim())); + return Status::OK(); + }) .Doc(R"doc( Applies set operation along last dimension of 2 `SparseTensor` inputs. diff --git a/tensorflow/contrib/metrics/python/kernel_tests/histogram_ops_test.py b/tensorflow/contrib/metrics/python/kernel_tests/histogram_ops_test.py index 15fa107d4fb..6ba3b11f3ba 100644 --- a/tensorflow/contrib/metrics/python/kernel_tests/histogram_ops_test.py +++ b/tensorflow/contrib/metrics/python/kernel_tests/histogram_ops_test.py @@ -163,7 +163,7 @@ class AUCUsingHistogramTest(tf.test.TestCase): self.rng, frac_true) # Fetch current auc, and verify that fetching again doesn't change it. 
auc_eval = auc.eval() - self.assertEqual(auc_eval, auc.eval()) + self.assertAlmostEqual(auc_eval, auc.eval(), places=5) msg = ('nbins: %s, desired_auc: %s, score_range: %s, ' 'num_records: %s, frac_true: %s, num_updates: %s') % (nbins, diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py index 42a4cacf4e6..6754052a7f5 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py @@ -2427,11 +2427,11 @@ class StreamingMeanIOUTest(tf.test.TestCase): tf.contrib.metrics.streaming_mean_iou( predictions, labels, num_classes=2, ignore_mask=ignore_mask) - def testIgnoreMaskIsNotBooleanRaisesValueError(self): + def testIgnoreMaskIsNotBooleanRaisesTypeError(self): predictions = tf.ones([10]) labels = tf.ones([10]) ignore_mask = tf.ones([10]) - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): tf.contrib.metrics.streaming_mean_iou( predictions, labels, num_classes=2, ignore_mask=ignore_mask) diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD index 916fb0a1720..0671cb3d809 100644 --- a/tensorflow/contrib/opt/BUILD +++ b/tensorflow/contrib/opt/BUILD @@ -24,6 +24,7 @@ py_test( deps = [ ":opt_py", "//tensorflow:tensorflow_py", + "//tensorflow/python:extra_py_tests_deps", ], ) diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py index 0627f5db66b..7629662b079 100644 --- a/tensorflow/contrib/opt/python/training/external_optimizer.py +++ b/tensorflow/contrib/opt/python/training/external_optimizer.py @@ -23,6 +23,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradients from tensorflow.python.ops import variables +from tensorflow.python.platform import tf_logging as logging __all__ = ['ExternalOptimizerInterface', 
'ScipyOptimizerInterface'] @@ -117,24 +118,24 @@ class ExternalOptimizerInterface(object): step_callback: A function to be called at each optimization step; arguments are the current values of all optimization variables flattened into a single vector. - loss_callback: A function to be called every time the loss is computed, - with evaluated fetches supplied as positional arguments. - grad_callback: A function to be called every time the loss gradient is - computed, with evaluated fetches supplied as positional arguments. + loss_callback: A function to be called every time the loss and gradients + are computed, with evaluated fetches supplied as positional arguments. + grad_callback: Deprecated. """ session = session or ops.get_default_session() feed_dict = feed_dict or {} fetches = fetches or [] loss_callback = loss_callback or (lambda *fetches: None) - grad_callback = grad_callback or (lambda *fetches: None) step_callback = step_callback or (lambda xk: None) + # TODO(chapelle): Remove grad_callback (b/30590858) + if grad_callback: + logging.warn('grad_callback is deprecated. Please use loss_callback.') # Construct loss function and associated gradient. - loss_func = self._make_eval_func( - self._loss, session, feed_dict, fetches, loss_callback) loss_grad_func = self._make_eval_func( - self._packed_loss_grad, session, feed_dict, fetches, grad_callback) + [self._loss, self._packed_loss_grad], + session, feed_dict, fetches, loss_callback) # Construct equality constraint functions and associated gradients. equality_funcs = self._make_eval_funcs( @@ -153,8 +154,8 @@ class ExternalOptimizerInterface(object): # Perform minimization. 
packed_var_val = self._minimize( - initial_val=initial_packed_var_val, loss_func=loss_func, - loss_grad_func=loss_grad_func, equality_funcs=equality_funcs, + initial_val=initial_packed_var_val, loss_grad_func=loss_grad_func, + equality_funcs=equality_funcs, equality_grad_funcs=equality_grad_funcs, inequality_funcs=inequality_funcs, inequality_grad_funcs=inequality_grad_funcs, @@ -166,7 +167,7 @@ class ExternalOptimizerInterface(object): session.run(self._var_updates, feed_dict=dict(zip(self._update_placeholders, var_vals))) - def _minimize(self, initial_val, loss_func, loss_grad_func, equality_funcs, + def _minimize(self, initial_val, loss_grad_func, equality_funcs, equality_grad_funcs, inequality_funcs, inequality_grad_funcs, step_callback, optimizer_kwargs): """Wrapper for a particular optimization algorithm implementation. @@ -177,9 +178,8 @@ class ExternalOptimizerInterface(object): Args: initial_val: A NumPy vector of initial values. - loss_func: A function accepting a NumPy packed variable vector and - returning a loss value. - loss_grad_func: A function that computes the gradient of loss_func with + loss_grad_func: A function accepting a NumPy packed variable vector and + returning two outputs, a loss value and the gradient of that loss with respect to the packed variable vector. equality_funcs: A list of functions each of which specifies a scalar quantity that an optimizer should hold exactly zero. 
@@ -209,9 +209,13 @@ class ExternalOptimizerInterface(object): flattened = [array_ops.reshape(tensor, [-1]) for tensor in tensors] return array_ops.concat(0, flattened) - def _make_eval_func(self, tensor, session, feed_dict, fetches, + def _make_eval_func(self, tensors, session, feed_dict, fetches, callback=None): - """Construct a function that evaluates a `Tensor`.""" + """Construct a function that evaluates a `Tensor` or list of `Tensor`s.""" + if not isinstance(tensors, list): + tensors = [tensors] + num_tensors = len(tensors) + def eval_func(x): """Function to evaluate a `Tensor`.""" augmented_feed_dict = { @@ -219,15 +223,15 @@ class ExternalOptimizerInterface(object): for var, packing_slice in zip(self._vars, self._packing_slices) } augmented_feed_dict.update(feed_dict) - augmented_fetches = [tensor] + fetches + augmented_fetches = tensors + fetches augmented_fetch_vals = session.run( augmented_fetches, feed_dict=augmented_feed_dict) if callable(callback): - callback(*augmented_fetch_vals[1:]) + callback(*augmented_fetch_vals[num_tensors:]) - return augmented_fetch_vals[0] + return augmented_fetch_vals[:num_tensors] return eval_func @@ -284,12 +288,13 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface): _DEFAULT_METHOD = 'L-BFGS-B' - def _minimize(self, initial_val, loss_func, loss_grad_func, equality_funcs, + def _minimize(self, initial_val, loss_grad_func, equality_funcs, equality_grad_funcs, inequality_funcs, inequality_grad_funcs, step_callback, optimizer_kwargs): - def grad_func_wrapper(x): + def loss_grad_func_wrapper(x): # SciPy's L-BFGS-B Fortran implementation requires gradients as doubles. 
- return loss_grad_func(x).astype('float64') + loss, gradient = loss_grad_func(x) + return loss, gradient.astype('float64') method = optimizer_kwargs.pop('method', self._DEFAULT_METHOD) @@ -299,9 +304,9 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface): for func, grad_func in zip(inequality_funcs, inequality_grad_funcs): constraints.append({'type': 'ineq', 'fun': func, 'jac': grad_func}) - minimize_args = [loss_func, initial_val] + minimize_args = [loss_grad_func_wrapper, initial_val] minimize_kwargs = { - 'jac': grad_func_wrapper, + 'jac': True, 'callback': step_callback, 'method': method, 'constraints': constraints, @@ -313,7 +318,15 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface): del minimize_kwargs['callback'] import scipy.optimize # pylint: disable=g-import-not-at-top - return scipy.optimize.minimize(*minimize_args, **minimize_kwargs)['x'] + result = scipy.optimize.minimize(*minimize_args, **minimize_kwargs) + logging.info('Optimization terminated with:\n' + ' Message: %s\n' + ' Objective function value: %f\n' + ' Number of iterations: %d\n' + ' Number of functions evaluations: %d', + result.message, result.fun, result.nit, result.nfev) + + return result['x'] def _accumulate(list_): diff --git a/tensorflow/contrib/opt/python/training/external_optimizer_test.py b/tensorflow/contrib/opt/python/training/external_optimizer_test.py index 6226f22eae2..95d27d0fe9c 100644 --- a/tensorflow/contrib/opt/python/training/external_optimizer_test.py +++ b/tensorflow/contrib/opt/python/training/external_optimizer_test.py @@ -28,36 +28,22 @@ try: except ImportError: import builtins -try: - import mock -except ImportError: - try: - import unittest.mock as mock - except ImportError: - # At the moment TensorFlow does not have access to mock when in Python 2.7 - # mode, although mock is part of the standard Python 3 library. If mock is - # not available, indicate this by assigning None to it. 
- mock = None -# pylint: enable=g-import-not-at-top,unused-import - class MockOptimizerInterface(tf.contrib.opt.ExternalOptimizerInterface): NUM_STEP_CALLS = 5 NUM_LOSS_CALLS = 2 - NUM_GRAD_CALLS = 3 - def _minimize(self, initial_val, loss_func, loss_grad_func, step_callback, + def _minimize(self, initial_val, loss_grad_func, step_callback, optimizer_kwargs, **unused_kwargs): """Minimize (x - x0)**2 / 2 with respect to x.""" for _ in range(self.NUM_LOSS_CALLS): - loss_func(initial_val) - for _ in range(self.NUM_GRAD_CALLS - 1): loss_grad_func(initial_val) for _ in range(self.NUM_STEP_CALLS): step_callback(initial_val) - return initial_val - loss_grad_func(initial_val) + _, grad = loss_grad_func(initial_val) + return initial_val - grad class TestCase(tf.test.TestCase): @@ -72,30 +58,6 @@ class TestCase(tf.test.TestCase): super(TestCase, self).assertAllClose(array1, array2, rtol=1e-5, atol=1e-5) - def mock_import(self, module_name): - """Causes importing a specific module to return a mock.MagicMock instance. - - Usage: - with mock_import('scipy'): - import scipy # scipy is a MagicMock. - x = scipy.blah()[7] # x is also a MagicMock. - - Args: - module_name: Name of module that should be mocked. - - Returns: - A context manager for use in a with statement. - """ - orig_import = __import__ - mocked_module = mock.MagicMock() - - def import_mock(name, *args, **kwargs): - if name == module_name: - return mocked_module - return orig_import(name, *args, **kwargs) - - return mock.patch.object(builtins, '__import__', side_effect=import_mock) - class ExternalOptimizerInterfaceTest(TestCase): @@ -123,11 +85,6 @@ class ExternalOptimizerInterfaceTest(TestCase): self.assertAllClose(np.arange(6).reshape(2, 3) + 3, sess.run(matrix)) def test_callbacks(self): - if mock is None: - # This test requires mock. See comment in imports section at top. 
- tf.logging.warning('This test requires mock and will not be run') - return - vector_val = np.array([7., -2.], dtype=np.float32) vector = tf.Variable(vector_val, 'vector') @@ -146,21 +103,17 @@ class ExternalOptimizerInterfaceTest(TestCase): extra_fetches = [loss] - step_callback = mock.Mock() - loss_callback = mock.Mock() - grad_callback = mock.Mock() + step_callback = tf.test.mock.Mock() + loss_callback = tf.test.mock.Mock() optimizer.minimize( sess, fetches=extra_fetches, loss_callback=loss_callback, - grad_callback=grad_callback, step_callback=step_callback) + step_callback=step_callback) - call = mock.call(loss_val) + call = tf.test.mock.call(loss_val) loss_calls = [call] * MockOptimizerInterface.NUM_LOSS_CALLS loss_callback.assert_has_calls(loss_calls) - grad_calls = [call] * MockOptimizerInterface.NUM_GRAD_CALLS - grad_callback.assert_has_calls(grad_calls) - args, _ = step_callback.call_args self.assertAllClose(initial_vector_val, args[0]) @@ -168,52 +121,35 @@ class ExternalOptimizerInterfaceTest(TestCase): class ScipyOptimizerInterfaceTest(TestCase): def test_unconstrained(self): - if mock is None: - # This test requires mock. See comment in imports section at top. - tf.logging.warning('This test requires mock and will not be run') - return - vector_initial_value = [7., 7.] - vector = tf.Variable(vector_initial_value, 'vector') + def objective(x): + """Rosenbrock function. (Carl Edward Rasmussen, 2001-07-21). - # Make norm as small as possible. 
- loss = tf.reduce_sum(tf.square(vector)) + f(x) = sum_{i=1:D-1} 100*(x(i+1) - x(i)^2)^2 + (1-x(i))^2 - optimizer = tf.contrib.opt.ScipyOptimizerInterface(loss) + Args: + x: a Variable + Returns: + f: a tensor (objective value) + """ + + d = tf.size(x) + s = tf.add(100 * tf.square(tf.sub(tf.slice(x, [1], [d - 1]), + tf.square(tf.slice(x, [0], [d - 1])))), + tf.square(tf.sub(1.0, tf.slice(x, [0], [d - 1])))) + return tf.reduce_sum(s) + + dimension = 5 + x = tf.Variable(tf.zeros(dimension)) + optimizer = tf.contrib.opt.ScipyOptimizerInterface(objective(x)) with self.test_session() as sess: sess.run(tf.initialize_all_variables()) + optimizer.minimize(sess) - with self.mock_import('scipy.optimize'): - import scipy.optimize # pylint: disable=g-import-not-at-top - # scipy.optimize is now a mock.MagicMock. - optimized_vector = np.array([1.23, -0.1]) - scipy.optimize.minimize.return_value = {'x': optimized_vector} - optimizer.minimize(sess) - - self.assertAllClose(optimized_vector, sess.run(vector)) - - self.assertEqual(1, len(scipy.optimize.minimize.mock_calls)) - call_signature = scipy.optimize.minimize.mock_calls[0] - - args = call_signature[1] - self.assertEqual(2, len(args)) - self.assertTrue(callable(args[0])) - self.assertAllClose(vector_initial_value, args[1]) - - kwargs = call_signature[2] - self.assertEqual(4, len(kwargs)) - self.assertEqual('L-BFGS-B', kwargs['method']) - self.assertTrue(callable(kwargs['jac'])) - self.assertTrue(callable(kwargs['callback'])) - self.assertEqual([], kwargs['constraints']) + self.assertAllClose(np.ones(dimension), sess.run(x)) def test_nonlinear_programming(self): - if mock is None: - # This test requires mock. See comment in imports section at top. - tf.logging.warning('This test requires mock and will not be run') - return - vector_initial_value = [7., 7.] 
vector = tf.Variable(vector_initial_value, 'vector') @@ -230,46 +166,8 @@ class ScipyOptimizerInterfaceTest(TestCase): with self.test_session() as sess: sess.run(tf.initialize_all_variables()) - - with self.mock_import('scipy.optimize'): - import scipy.optimize # pylint: disable=g-import-not-at-top - # scipy.optimize is now a mock.MagicMock. - optimized_vector = np.array([1.23, -0.1]) - scipy.optimize.minimize.return_value = {'x': optimized_vector} - - optimizer.minimize(sess) - - self.assertAllClose(optimized_vector, sess.run(vector)) - - self.assertEqual(1, len(scipy.optimize.minimize.mock_calls)) - call_signature = scipy.optimize.minimize.mock_calls[0] - - args = call_signature[1] - self.assertEqual(2, len(args)) - self.assertTrue(callable(args[0])) - self.assertAllClose(vector_initial_value, args[1]) - - kwargs = call_signature[2] - self.assertEqual(3, len(kwargs)) - self.assertEqual('SLSQP', kwargs['method']) - self.assertTrue(callable(kwargs['jac'])) - # No callback keyword arg since SLSQP doesn't support it. 
- - constraints = kwargs['constraints'] - self.assertEqual(2, len(constraints)) - - eq_constraint = constraints[0] - self.assertEqual(3, len(eq_constraint)) - self.assertEqual('eq', eq_constraint['type']) - self.assertTrue(callable(eq_constraint['fun'])) - self.assertTrue(callable(eq_constraint['jac'])) - - ineq_constraint = constraints[1] - self.assertEqual(3, len(ineq_constraint)) - self.assertEqual('ineq', ineq_constraint['type']) - self.assertTrue(callable(ineq_constraint['fun'])) - self.assertTrue(callable(ineq_constraint['jac'])) - + optimizer.minimize(sess) + self.assertAllClose(np.ones(2), sess.run(vector)) if __name__ == '__main__': tf.test.main() diff --git a/tensorflow/contrib/quantization/kernels/quantization_utils.h b/tensorflow/contrib/quantization/kernels/quantization_utils.h index c9a3c777977..45fda79ce50 100644 --- a/tensorflow/contrib/quantization/kernels/quantization_utils.h +++ b/tensorflow/contrib/quantization/kernels/quantization_utils.h @@ -25,7 +25,7 @@ limitations under the License. // to avoid a dependency on floating-point hardware. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "external/gemmlowp/public/gemmlowp.h" +#include "public/gemmlowp.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/threadpool.h" diff --git a/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc b/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc index 647e68ea121..b25bff45a11 100644 --- a/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc +++ b/tensorflow/contrib/quantization/kernels/quantized_conv_ops.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include #include -#include "external/gemmlowp/public/gemmlowp.h" +#include "public/gemmlowp.h" #include "tensorflow/contrib/quantization/kernels/quantization_utils.h" #include "tensorflow/contrib/quantization/kernels/reference_gemm.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/contrib/quantization/kernels/quantized_matmul_op.cc b/tensorflow/contrib/quantization/kernels/quantized_matmul_op.cc index 21abce932a1..18de2d1d97f 100644 --- a/tensorflow/contrib/quantization/kernels/quantized_matmul_op.cc +++ b/tensorflow/contrib/quantization/kernels/quantized_matmul_op.cc @@ -15,7 +15,7 @@ limitations under the License. // Implements a quantized eight-bit version of the matmul operation. -#include "external/gemmlowp/public/gemmlowp.h" +#include "public/gemmlowp.h" #include "tensorflow/contrib/quantization/kernels/quantization_utils.h" #include "tensorflow/contrib/quantization/kernels/reference_gemm.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/contrib/quantization/kernels/quantized_pooling_ops.cc b/tensorflow/contrib/quantization/kernels/quantized_pooling_ops.cc index c078de7ab18..33a12c47466 100644 --- a/tensorflow/contrib/quantization/kernels/quantized_pooling_ops.cc +++ b/tensorflow/contrib/quantization/kernels/quantized_pooling_ops.cc @@ -29,11 +29,6 @@ limitations under the License. 
#include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" -#if GOOGLE_CUDA -#include "tensorflow/core/kernels/maxpooling_op_gpu.h" -#include "tensorflow/core/kernels/pooling_ops_common_gpu.h" -#endif // GOOGLE_CUDA - namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; diff --git a/tensorflow/contrib/quantization/ops/array_ops.cc b/tensorflow/contrib/quantization/ops/array_ops.cc index 35d0e7f4c9e..e1cf3ded93f 100644 --- a/tensorflow/contrib/quantization/ops/array_ops.cc +++ b/tensorflow/contrib/quantization/ops/array_ops.cc @@ -13,11 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" namespace tensorflow { -// -------------------------------------------------------------------------- +using shape_inference::InferenceContext; +using shape_inference::Shape; REGISTER_OP("QuantizeV2") .Input("input: float") @@ -28,6 +31,15 @@ REGISTER_OP("QuantizeV2") .Output("output_max: float") .Attr("T: quantizedtype") .Attr("mode: {'MIN_COMBINED', 'MIN_FIRST'} = 'MIN_COMBINED'") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c)); + const Shape* unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }) .Doc(R"doc( Quantize the 'input' tensor of type float to 'output' tensor of type 'T'. 
@@ -96,6 +108,13 @@ REGISTER_OP("Dequantize") .Output("output: float") .Attr("T: quantizedtype") .Attr("mode: {'MIN_COMBINED', 'MIN_FIRST'} = 'MIN_COMBINED'") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c)); + const Shape* unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + return Status::OK(); + }) .Doc(R"doc( Dequantize the 'input' tensor into a float Tensor. diff --git a/tensorflow/contrib/quantization/ops/math_ops.cc b/tensorflow/contrib/quantization/ops/math_ops.cc index 6bc408531aa..ed0930c2d64 100644 --- a/tensorflow/contrib/quantization/ops/math_ops.cc +++ b/tensorflow/contrib/quantization/ops/math_ops.cc @@ -80,6 +80,15 @@ REGISTER_OP("QuantizeDownAndShrinkRange") .Output("output_max: float") .Attr("Tinput: quantizedtype") .Attr("out_type: quantizedtype") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c)); + const Shape* unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }) .Doc(R"doc( Convert the quantized 'input' tensor into a lower-precision 'output', using the actual distribution of the values to maximize the usage of the lower bit depth diff --git a/tensorflow/contrib/quantization/ops/nn_ops.cc b/tensorflow/contrib/quantization/ops/nn_ops.cc index fd12d155db5..c33f318c6e7 100644 --- a/tensorflow/contrib/quantization/ops/nn_ops.cc +++ b/tensorflow/contrib/quantization/ops/nn_ops.cc @@ -21,6 +21,7 @@ limitations under the License. 
namespace tensorflow { +using shape_inference::Dimension; using shape_inference::InferenceContext; using shape_inference::Shape; @@ -73,6 +74,17 @@ REGISTER_OP("QuantizedBiasAdd") .Attr("T1: quantizedtype") .Attr("T2: quantizedtype") .Attr("out_type: quantizedtype") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::BiasAddShape(c)); + const Shape* unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }) .Doc(R"doc( Adds Tensor 'bias' to Tensor 'input' for Quantized types. @@ -103,6 +115,17 @@ REGISTER_OP("QuantizedConv2D") .Attr("out_type: quantizedtype = DT_QINT32") .Attr("strides: list(int)") .Attr(GetPaddingAttrString()) + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c)); + const Shape* unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }) .Doc(R"doc( Computes a 2D convolution given quantized 4D input and filter tensors. 
The inputs are quantized tensors where the lowest value represents the real @@ -134,6 +157,15 @@ REGISTER_OP("QuantizedMaxPool") .Attr("ksize: list(int)") .Attr("strides: list(int)") .Attr(GetPaddingAttrString()) + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::MaxPoolShape(c)); + const Shape* unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }) .Doc(R"doc( Produces the max pool of the input tensor for quantized types. @@ -159,6 +191,15 @@ REGISTER_OP("QuantizedRelu") .Output("max_activations: float") .Attr("Tinput: quantizedtype") .Attr("out_type: quantizedtype = DT_QUINT8") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c)); + const Shape* unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }) .Doc(R"doc( Computes Quantized Rectified Linear: `max(features, 0)` @@ -179,6 +220,15 @@ REGISTER_OP("QuantizedRelu6") .Output("max_activations: float") .Attr("Tinput: quantizedtype") .Attr("out_type: quantizedtype = DT_QUINT8") + .SetShapeFn([](InferenceContext* c) { + TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c)); + const Shape* unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }) .Doc(R"doc( Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)` @@ -200,6 +250,15 @@ REGISTER_OP("QuantizedReluX") .Output("max_activations: float") .Attr("Tinput: quantizedtype") .Attr("out_type: quantizedtype = DT_QUINT8") + .SetShapeFn([](InferenceContext* c) { + 
TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c)); + const Shape* unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + return Status::OK(); + }) .Doc(R"doc( Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)` @@ -234,6 +293,25 @@ REGISTER_OP("QuantizedBatchNormWithGlobalNormalization") .Attr("out_type: quantizedtype") .Attr("variance_epsilon: float") .Attr("scale_after_normalization: bool") + .SetShapeFn([](InferenceContext* c) { + const Shape* input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + + const Dimension* last_dim = c->Dim(input, 3); + for (int i = 1; i < 5; ++i) { // covers m, v, beta, gamma + const Shape* vec; + TF_RETURN_IF_ERROR(c->WithRank(c->input(i * 3), 1, &vec)); + TF_RETURN_IF_ERROR(c->Merge(last_dim, c->Dim(vec, 0), &last_dim)); + } + + const Shape* out; + TF_RETURN_IF_ERROR(c->ReplaceDim(input, 3, last_dim, &out)); + c->set_output(0, out); + c->set_output(1, c->Scalar()); + c->set_output(2, c->Scalar()); + + return Status::OK(); + }) .Doc(R"doc( Quantized Batch normalization. 
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD index 9e819ba62fd..f69c656c68b 100644 --- a/tensorflow/contrib/rnn/BUILD +++ b/tensorflow/contrib/rnn/BUILD @@ -9,11 +9,16 @@ exports_files(["LICENSE"]) package(default_visibility = ["//tensorflow:__subpackages__"]) load("//tensorflow:tensorflow.bzl", "cuda_py_tests") +load("//tensorflow:tensorflow.bzl", "tf_custom_op_library") py_library( name = "rnn_py", srcs = ["__init__.py"] + glob(["python/ops/*.py"]), + data = [ + ":python/ops/_lstm_ops.so", + ], srcs_version = "PY2AND3", + visibility = ["//visibility:public"], ) cuda_py_tests( @@ -27,6 +32,33 @@ cuda_py_tests( ], ) +cuda_py_tests( + name = "lstm_ops_test", + size = "small", + srcs = ["python/kernel_tests/lstm_ops_test.py"], + additional_deps = [ + ":rnn_py", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], +) + +tf_custom_op_library( + name = "python/ops/_lstm_ops.so", + srcs = [ + "kernels/lstm_ops.cc", + "kernels/lstm_ops.h", + "ops/lstm_ops.cc", + ], + gpu_srcs = [ + "kernels/lstm_ops_gpu.cu.cc", + "kernels/lstm_ops.h", + ], + deps = [ + "//tensorflow/core/kernels:eigen_helpers", + ], +) + filegroup( name = "all_files", srcs = glob( diff --git a/tensorflow/contrib/rnn/__init__.py b/tensorflow/contrib/rnn/__init__.py index 2193f644849..8ead5f00045 100644 --- a/tensorflow/contrib/rnn/__init__.py +++ b/tensorflow/contrib/rnn/__init__.py @@ -12,14 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Ops for representing statistical distributions. +"""Additional RNN operations and cells. -## This package provides classes for statistical distributions. +## This package provides additional contributed RNNCells. 
+### Fused RNNCells +@@LSTMFusedCell + +### LSTM-like cells +@@CoupledInputForgetGateLSTMCell +@@TimeFreqLSTMCell +@@GridLSTMCell + +### RNNCell wrappers +@@AttentionCellWrapper """ + from __future__ import absolute_import from __future__ import division from __future__ import print_function # pylint: disable=unused-import,wildcard-import, line-too-long +from tensorflow.contrib.rnn.python.ops.lstm_ops import * from tensorflow.contrib.rnn.python.ops.rnn_cell import * diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.cc b/tensorflow/contrib/rnn/kernels/lstm_ops.cc new file mode 100644 index 00000000000..74bede713c1 --- /dev/null +++ b/tensorflow/contrib/rnn/kernels/lstm_ops.cc @@ -0,0 +1,1053 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA + +#include "tensorflow/contrib/rnn/kernels/lstm_ops.h" + +#include +#include + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" + +#if GOOGLE_CUDA +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +#if GOOGLE_CUDA + +namespace { +template +perftools::gputools::DeviceMemory AsDeviceMemory(const T* cuda_memory) { + perftools::gputools::DeviceMemoryBase wrapped(const_cast(cuda_memory)); + perftools::gputools::DeviceMemory typed(wrapped); + return typed; +} +} // namespace + +#endif // GOOGLE_CUDA + +namespace functor { +template +void TensorCuBlasGemm::operator()(OpKernelContext* ctx, + perftools::gputools::Stream* stream, + bool transa, bool transb, uint64 m, + uint64 n, uint64 k, T alpha, const T* a, + int lda, const T* b, int ldb, T beta, T* c, + int ldc) { +#if GOOGLE_CUDA + perftools::gputools::blas::Transpose trans[] = { + perftools::gputools::blas::Transpose::kNoTranspose, + perftools::gputools::blas::Transpose::kTranspose}; + + auto a_ptr = AsDeviceMemory(a); + auto b_ptr = AsDeviceMemory(b); + auto c_ptr = AsDeviceMemory(c); + + bool blas_launch_status = + stream + ->ThenBlasGemm(trans[transa], trans[transb], m, n, k, alpha, a_ptr, + lda, b_ptr, ldb, beta, &c_ptr, ldc) + .ok(); + OP_REQUIRES(ctx, blas_launch_status, errors::Aborted("CuBlasGemm 
failed!")); +#else + ctx->SetStatus(errors::InvalidArgument("CuBlasGemm needs CUDA.")); +#endif +} + +template struct TensorCuBlasGemm; +// template struct TensorCuBlasGemm; +} // end namespace functor + +template +class LSTMFusedCellOp : public OpKernel { + public: + explicit LSTMFusedCellOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("forget_bias", &forget_bias_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("cell_clip", &cell_clip_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_peephole", &use_peephole_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* x_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("x", &x_tensor)); + + const Tensor* cs_prev_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("cs_prev", &cs_prev_tensor)); + + const Tensor* h_prev_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("h_prev", &h_prev_tensor)); + + const Tensor* w_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor)); + + const Tensor* wci_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor)); + + const Tensor* wcf_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("wcf", &wcf_tensor)); + + const Tensor* wco_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("wco", &wco_tensor)); + + const Tensor* b_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("b", &b_tensor)); + + const int64 batch_size = x_tensor->dim_size(0); + const int64 input_size = x_tensor->dim_size(1); + const int64 cell_size = cs_prev_tensor->dim_size(1); + + // Sanity checks for our input shapes. + OP_REQUIRES(ctx, cs_prev_tensor->dim_size(0) == batch_size, + errors::InvalidArgument("cs_prev.dims(0) != batch_size: ", + cs_prev_tensor->dim_size(0), " vs. ", + batch_size)); + OP_REQUIRES(ctx, cs_prev_tensor->dim_size(1) == cell_size, + errors::InvalidArgument("cs_prev.dims(1) != cell_size: ", + cs_prev_tensor->dim_size(1), " vs. 
", + cell_size)); + + OP_REQUIRES(ctx, h_prev_tensor->dim_size(0) == batch_size, + errors::InvalidArgument("h_prev.dims(0) != batch_size: ", + h_prev_tensor->dim_size(0), " vs. ", + batch_size)); + OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, + errors::InvalidArgument("h_prev.dims(1) != cell_size: ", + h_prev_tensor->dim_size(1), " vs. ", + cell_size)); + + OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size, + errors::InvalidArgument( + "w.dim_size(0) != input_size + cell_size: ", + w_tensor->dim_size(0), " vs. ", input_size + cell_size)); + OP_REQUIRES( + ctx, w_tensor->dim_size(1) == cell_size * 4, + errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ", + w_tensor->dim_size(1), " vs. ", cell_size * 4)); + + OP_REQUIRES( + ctx, b_tensor->dim_size(0) == cell_size * 4, + errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ", + b_tensor->dim_size(0), " vs. ", cell_size * 4)); + + // Allocate our output tensors. + Tensor* i_tensor = nullptr; + OP_REQUIRES_OK( + ctx, ctx->allocate_output("i", TensorShape({batch_size, cell_size}), + &i_tensor)); + + Tensor* cs_tensor = nullptr; + OP_REQUIRES_OK( + ctx, ctx->allocate_output("cs", TensorShape({batch_size, cell_size}), + &cs_tensor)); + + Tensor* f_tensor = nullptr; + OP_REQUIRES_OK( + ctx, ctx->allocate_output("f", TensorShape({batch_size, cell_size}), + &f_tensor)); + + Tensor* o_tensor = nullptr; + OP_REQUIRES_OK( + ctx, ctx->allocate_output("o", TensorShape({batch_size, cell_size}), + &o_tensor)); + + Tensor* ci_tensor = nullptr; + OP_REQUIRES_OK( + ctx, ctx->allocate_output("ci", TensorShape({batch_size, cell_size}), + &ci_tensor)); + + Tensor* co_tensor = nullptr; + OP_REQUIRES_OK( + ctx, ctx->allocate_output("co", TensorShape({batch_size, cell_size}), + &co_tensor)); + + Tensor* h_tensor = nullptr; + OP_REQUIRES_OK( + ctx, ctx->allocate_output("h", TensorShape({batch_size, cell_size}), + &h_tensor)); + + // Allocate our temp tensors. 
+ Tensor xh_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp( + DataTypeToEnum::v(), + TensorShape({batch_size, input_size + cell_size}), + &xh_tensor)); + + Tensor icfo_tensor; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum::v(), + TensorShape({batch_size, cell_size * 4}), + &icfo_tensor)); + + const Device& device = ctx->eigen_device(); + perftools::gputools::Stream* stream = + std::is_same::value + ? ctx->op_device_context()->stream() + : nullptr; + + functor::LSTMFusedCellFprop(batch_size, input_size, + cell_size)( + ctx, stream, device, forget_bias_, cell_clip_, use_peephole_, + x_tensor->matrix(), cs_prev_tensor->matrix(), + h_prev_tensor->matrix(), w_tensor->matrix(), wci_tensor->vec(), + wcf_tensor->vec(), wco_tensor->vec(), b_tensor->vec(), + xh_tensor.matrix(), i_tensor->matrix(), cs_tensor->matrix(), + f_tensor->matrix(), o_tensor->matrix(), ci_tensor->matrix(), + co_tensor->matrix(), icfo_tensor.matrix(), h_tensor->matrix()); + } + + private: + float forget_bias_; + float cell_clip_; + bool use_peephole_; +}; + +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("LSTMFusedCell").Device(DEVICE_CPU).TypeConstraint("T"), \ + LSTMFusedCellOp); +REGISTER_KERNEL(float); +// REGISTER_KERNEL(double); +#undef REGISTER_KERNEL + +#if GOOGLE_CUDA +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void LSTMFusedCellFprop::operator()( \ + OpKernelContext* ctx, perftools::gputools::Stream* stream, \ + const GPUDevice& d, const T forget_bias, const T cell_clip, \ + bool use_peephole, typename TTypes::ConstMatrix x, \ + typename TTypes::ConstMatrix cs_prev, \ + typename TTypes::ConstMatrix h_prev, \ + typename TTypes::ConstMatrix w, typename TTypes::ConstVec wci, \ + typename TTypes::ConstVec wcf, typename TTypes::ConstVec wco, \ + typename TTypes::ConstVec b, typename TTypes::Matrix xh, \ + typename TTypes::Matrix i, typename TTypes::Matrix cs, \ + typename TTypes::Matrix f, typename TTypes::Matrix o, \ + typename 
TTypes::Matrix ci, typename TTypes::Matrix co, \ + typename TTypes::Matrix icfo, typename TTypes::Matrix h); \ + \ + extern template struct LSTMFusedCellFprop; + +DECLARE_GPU_SPEC(float); +// DECLARE_GPU_SPEC(double); +#undef DECLARE_GPU_SPEC +} // end namespace functor + +#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("LSTMFusedCell").Device(DEVICE_GPU).TypeConstraint("T"), \ + LSTMFusedCellOp); + +REGISTER_GPU_KERNEL(float); +// REGISTER_GPU_KERNEL(double); +#undef REGISTER_GPU_KERNEL +#endif // GOOGLE_CUDA + +template +class LSTMFusedCellGradOp : public OpKernel { + public: + explicit LSTMFusedCellGradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_peephole", &use_peephole_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* x_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("x", &x_tensor)); + + const Tensor* cs_prev_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("cs_prev", &cs_prev_tensor)); + + const Tensor* h_prev_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("h_prev", &h_prev_tensor)); + + const Tensor* w_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor)); + + const Tensor* wci_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor)); + + const Tensor* wcf_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("wcf", &wcf_tensor)); + + const Tensor* wco_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("wco", &wco_tensor)); + + const Tensor* b_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("b", &b_tensor)); + + const Tensor* i_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("i", &i_tensor)); + + const Tensor* cs_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("cs", &cs_tensor)); + + const Tensor* f_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("f", &f_tensor)); + + const Tensor* o_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("o", &o_tensor)); + + const Tensor* ci_tensor = nullptr; + OP_REQUIRES_OK(ctx, 
ctx->input("ci", &ci_tensor)); + + const Tensor* co_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("co", &co_tensor)); + + const Tensor* cs_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("cs_grad", &cs_grad_tensor)); + + const Tensor* h_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("h_grad", &h_grad_tensor)); + + const int64 batch_size = x_tensor->dim_size(0); + const int64 input_size = x_tensor->dim_size(1); + const int64 cell_size = cs_prev_tensor->dim_size(1); + + // Sanity checks for our input shapes. + OP_REQUIRES(ctx, cs_prev_tensor->dim_size(0) == batch_size, + errors::InvalidArgument("cs_prev.dims(0) != batch_size: ", + cs_prev_tensor->dim_size(0), " vs. ", + batch_size)); + OP_REQUIRES(ctx, cs_prev_tensor->dim_size(1) == cell_size, + errors::InvalidArgument("cs_prev.dims(1) != cell_size: ", + cs_prev_tensor->dim_size(1), " vs. ", + cell_size)); + + OP_REQUIRES(ctx, h_prev_tensor->dim_size(0) == batch_size, + errors::InvalidArgument("h_prev.dims(0) != batch_size: ", + h_prev_tensor->dim_size(0), " vs. ", + batch_size)); + OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size, + errors::InvalidArgument("h_prev.dims(1) != cell_size: ", + h_prev_tensor->dim_size(1), " vs. ", + cell_size)); + + OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size, + errors::InvalidArgument( + "w.dim_size(0) != input_size + cell_size: ", + w_tensor->dim_size(0), " vs. ", input_size + cell_size)); + OP_REQUIRES( + ctx, w_tensor->dim_size(1) == cell_size * 4, + errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ", + w_tensor->dim_size(1), " vs. ", cell_size * 4)); + + OP_REQUIRES( + ctx, b_tensor->dim_size(0) == cell_size * 4, + errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ", + b_tensor->dim_size(0), " vs. ", cell_size * 4)); + + OP_REQUIRES( + ctx, i_tensor->dim_size(0) == batch_size, + errors::InvalidArgument("i.dim_size(0) != batch_size: ", + i_tensor->dim_size(0), " vs. 
", batch_size)); + OP_REQUIRES( + ctx, i_tensor->dim_size(1) == cell_size, + errors::InvalidArgument("i.dim_size(1) != cell_size: ", + i_tensor->dim_size(1), " vs. ", cell_size)); + + OP_REQUIRES( + ctx, cs_tensor->dim_size(0) == batch_size, + errors::InvalidArgument("cs.dim_size(0) != batch_size: ", + cs_tensor->dim_size(0), " vs. ", batch_size)); + OP_REQUIRES( + ctx, cs_tensor->dim_size(1) == cell_size, + errors::InvalidArgument("cs.dim_size(1) != cell_size: ", + cs_tensor->dim_size(1), " vs. ", cell_size)); + + OP_REQUIRES( + ctx, f_tensor->dim_size(0) == batch_size, + errors::InvalidArgument("f.dim_size(0) != batch_size: ", + f_tensor->dim_size(0), " vs. ", batch_size)); + OP_REQUIRES( + ctx, f_tensor->dim_size(1) == cell_size, + errors::InvalidArgument("i.dim_size(1) != cell_size: ", + f_tensor->dim_size(1), " vs. ", cell_size)); + + OP_REQUIRES( + ctx, o_tensor->dim_size(0) == batch_size, + errors::InvalidArgument("o.dim_size(0) != batch_size: ", + o_tensor->dim_size(0), " vs. ", batch_size)); + OP_REQUIRES( + ctx, o_tensor->dim_size(1) == cell_size, + errors::InvalidArgument("o.dim_size(1) != cell_size: ", + o_tensor->dim_size(1), " vs. ", cell_size)); + + OP_REQUIRES( + ctx, ci_tensor->dim_size(0) == batch_size, + errors::InvalidArgument("ci.dim_size(0) != batch_size: ", + ci_tensor->dim_size(0), " vs. ", batch_size)); + OP_REQUIRES( + ctx, ci_tensor->dim_size(1) == cell_size, + errors::InvalidArgument("ci.dim_size(1) != cell_size: ", + ci_tensor->dim_size(1), " vs. ", cell_size)); + + OP_REQUIRES( + ctx, co_tensor->dim_size(0) == batch_size, + errors::InvalidArgument("co.dim_size(0) != batch_size: ", + co_tensor->dim_size(0), " vs. ", batch_size)); + OP_REQUIRES( + ctx, co_tensor->dim_size(1) == cell_size, + errors::InvalidArgument("co.dim_size(1) != cell_size: ", + co_tensor->dim_size(1), " vs. 
", cell_size)); + + OP_REQUIRES(ctx, cs_grad_tensor->dim_size(0) == batch_size, + errors::InvalidArgument( + "cs_grad_tensor.dims(0) != batch_size: ", + cs_grad_tensor->dim_size(0), " vs. ", batch_size)); + OP_REQUIRES(ctx, cs_grad_tensor->dim_size(1) == cell_size, + errors::InvalidArgument("cs_grad_tensor.dims(1) != cell_size: ", + cs_grad_tensor->dim_size(1), " vs. ", + cell_size)); + + OP_REQUIRES(ctx, h_grad_tensor->dim_size(0) == batch_size, + errors::InvalidArgument("h_grad_tensor.dims(0) != batch_size: ", + h_grad_tensor->dim_size(0), " vs. ", + batch_size)); + OP_REQUIRES(ctx, h_grad_tensor->dim_size(1) == cell_size, + errors::InvalidArgument("h_grad_tensor.dims(1) != cell_size: ", + h_grad_tensor->dim_size(1), " vs. ", + cell_size)); + + // Allocate our output tensors. + Tensor* cs_prev_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, + ctx->allocate_output("cs_prev_grad", + TensorShape({batch_size, cell_size}), + &cs_prev_grad_tensor)); + + Tensor* dicfo_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output( + "dicfo", TensorShape({batch_size, cell_size * 4}), + &dicfo_tensor)); + + Tensor* wci_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output("wci_grad", wci_tensor->shape(), + &wci_grad_tensor)); + + Tensor* wcf_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output("wcf_grad", wcf_tensor->shape(), + &wcf_grad_tensor)); + + Tensor* wco_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output("wco_grad", wco_tensor->shape(), + &wco_grad_tensor)); + + // Allocate our temp tensors. 
+ Tensor do_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), + TensorShape({batch_size, cell_size}), + &do_tensor)); + + Tensor dcs_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), + TensorShape({batch_size, cell_size}), + &dcs_tensor)); + + Tensor dci_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), + TensorShape({batch_size, cell_size}), + &dci_tensor)); + + Tensor df_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), + TensorShape({batch_size, cell_size}), + &df_tensor)); + + Tensor di_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), + TensorShape({batch_size, cell_size}), + &di_tensor)); + + const Device& device = ctx->eigen_device(); + perftools::gputools::Stream* stream = + std::is_same::value + ? ctx->op_device_context()->stream() + : nullptr; + + functor::TensorZero()(device, wci_grad_tensor->flat()); + functor::TensorZero()(device, wcf_grad_tensor->flat()); + functor::TensorZero()(device, wco_grad_tensor->flat()); + + functor::LSTMFusedCellBprop(batch_size, input_size, + cell_size)( + ctx, stream, device, use_peephole_, x_tensor->matrix(), + cs_prev_tensor->matrix(), h_prev_tensor->matrix(), + w_tensor->matrix(), wci_tensor->vec(), wcf_tensor->vec(), + wco_tensor->vec(), b_tensor->vec(), i_tensor->matrix(), + cs_tensor->matrix(), f_tensor->matrix(), o_tensor->matrix(), + ci_tensor->matrix(), co_tensor->matrix(), + cs_grad_tensor->matrix(), h_grad_tensor->matrix(), + do_tensor.matrix(), dcs_tensor.matrix(), dci_tensor.matrix(), + df_tensor.matrix(), di_tensor.matrix(), dicfo_tensor->matrix(), + cs_prev_grad_tensor->matrix(), wci_grad_tensor->vec(), + wcf_grad_tensor->vec(), wco_grad_tensor->vec()); + } + + protected: + bool use_peephole_; +}; + +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("LSTMFusedCellGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + LSTMFusedCellGradOp); +REGISTER_KERNEL(float); +// REGISTER_KERNEL(double); +#undef 
REGISTER_KERNEL + +#if GOOGLE_CUDA +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void LSTMFusedCellBprop::operator()( \ + OpKernelContext* ctx, perftools::gputools::Stream* stream, \ + const GPUDevice& d, bool use_peephole, \ + typename TTypes::ConstMatrix x, \ + typename TTypes::ConstMatrix cs_prev, \ + typename TTypes::ConstMatrix h_prev, \ + typename TTypes::ConstMatrix w, typename TTypes::ConstVec wci, \ + typename TTypes::ConstVec wcf, typename TTypes::ConstVec wco, \ + typename TTypes::ConstVec b, typename TTypes::ConstMatrix i, \ + typename TTypes::ConstMatrix cs, typename TTypes::ConstMatrix f, \ + typename TTypes::ConstMatrix o, typename TTypes::ConstMatrix ci, \ + typename TTypes::ConstMatrix co, \ + typename TTypes::ConstMatrix cs_grad, \ + typename TTypes::ConstMatrix h_grad, typename TTypes::Matrix do_, \ + typename TTypes::Matrix dcs, typename TTypes::Matrix dci, \ + typename TTypes::Matrix df, typename TTypes::Matrix di, \ + typename TTypes::Matrix dicfo, \ + typename TTypes::Matrix cs_prev_grad, \ + typename TTypes::Vec wci_grad, typename TTypes::Vec wcf_grad, \ + typename TTypes::Vec wco_grad); \ + \ + extern template struct LSTMFusedCellBprop; + +DECLARE_GPU_SPEC(float); +// DECLARE_GPU_SPEC(double); +#undef DECLARE_GPU_SPEC +} // namespace functor + +#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("LSTMFusedCellGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + LSTMFusedCellGradOp); + +REGISTER_GPU_KERNEL(float); +// REGISTER_GPU_KERNEL(double); +#undef REGISTER_GPU_KERNEL +#endif // GOOGLE_CUDA + +template +class FusedLSTMOp : public OpKernel { + public: + explicit FusedLSTMOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_len", &max_len_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("forget_bias", &forget_bias_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("cell_clip", &cell_clip_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_peephole", &use_peephole_)); + } + + void 
Compute(OpKernelContext* ctx) override { + const Tensor* seq_len_max_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("seq_len_max", &seq_len_max_tensor)); + + OpInputList x_list; + OP_REQUIRES_OK(ctx, ctx->input_list("x", &x_list)); + const int64 batch_size = x_list[0].dim_size(0); + const int64 input_size = x_list[0].dim_size(1); + + const Tensor* cs_prev_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("cs_prev", &cs_prev_tensor)); + + const Tensor* h_prev_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("h_prev", &h_prev_tensor)); + + const Tensor* w_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor)); + + const Tensor* wci_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor)); + + const Tensor* wcf_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("wcf", &wcf_tensor)); + + const Tensor* wco_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("wco", &wco_tensor)); + + const Tensor* b_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("b", &b_tensor)); + const int64 cell_size = b_tensor->dim_size(0) / 4; + + OpOutputList i_list; + OP_REQUIRES_OK(ctx, ctx->output_list("i", &i_list)); + + OpOutputList cs_list; + OP_REQUIRES_OK(ctx, ctx->output_list("cs", &cs_list)); + + OpOutputList f_list; + OP_REQUIRES_OK(ctx, ctx->output_list("f", &f_list)); + + OpOutputList o_list; + OP_REQUIRES_OK(ctx, ctx->output_list("o", &o_list)); + + OpOutputList ci_list; + OP_REQUIRES_OK(ctx, ctx->output_list("ci", &ci_list)); + + OpOutputList co_list; + OP_REQUIRES_OK(ctx, ctx->output_list("co", &co_list)); + + OpOutputList h_list; + OP_REQUIRES_OK(ctx, ctx->output_list("h", &h_list)); + + TensorShape batch_cell_shape({batch_size, cell_size}); + for (int64 t = 0; t < max_len_; ++t) { + Tensor* i_tensor = nullptr; + OP_REQUIRES_OK(ctx, i_list.allocate(t, batch_cell_shape, &i_tensor)); + + Tensor* cs_tensor = nullptr; + OP_REQUIRES_OK(ctx, cs_list.allocate(t, batch_cell_shape, &cs_tensor)); + + Tensor* f_tensor = nullptr; + OP_REQUIRES_OK(ctx, 
f_list.allocate(t, batch_cell_shape, &f_tensor)); + + Tensor* o_tensor = nullptr; + OP_REQUIRES_OK(ctx, o_list.allocate(t, batch_cell_shape, &o_tensor)); + + Tensor* ci_tensor = nullptr; + OP_REQUIRES_OK(ctx, ci_list.allocate(t, batch_cell_shape, &ci_tensor)); + + Tensor* co_tensor = nullptr; + OP_REQUIRES_OK(ctx, co_list.allocate(t, batch_cell_shape, &co_tensor)); + + Tensor* h_tensor = nullptr; + OP_REQUIRES_OK(ctx, h_list.allocate(t, batch_cell_shape, &h_tensor)); + } + + Tensor xh_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp( + DataTypeToEnum::v(), + TensorShape({batch_size, input_size + cell_size}), + &xh_tensor)); + + Tensor icfo_tensor; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum::v(), + TensorShape({batch_size, cell_size * 4}), + &icfo_tensor)); + + const Device& device = ctx->eigen_device(); + perftools::gputools::Stream* stream = + std::is_same::value + ? ctx->op_device_context()->stream() + : nullptr; + + const int64 seq_len_max = seq_len_max_tensor->scalar()(); + for (int64 t = 0; t < seq_len_max; ++t) { + const Tensor& x_tensor = x_list[t]; + const Tensor& cs_prev_tensor2 = + t == 0 ? *cs_prev_tensor : *cs_list[t - 1]; + const Tensor& h_prev_tensor2 = t == 0 ? 
*h_prev_tensor : *h_list[t - 1]; + + Tensor* i_tensor = i_list[t]; + Tensor* cs_tensor = cs_list[t]; + Tensor* f_tensor = f_list[t]; + Tensor* o_tensor = o_list[t]; + Tensor* ci_tensor = ci_list[t]; + Tensor* co_tensor = co_list[t]; + Tensor* h_tensor = h_list[t]; + + functor::LSTMFusedCellFprop(batch_size, input_size, + cell_size)( + ctx, stream, device, forget_bias_, cell_clip_, use_peephole_, + x_tensor.matrix(), cs_prev_tensor2.matrix(), + h_prev_tensor2.matrix(), w_tensor->matrix(), + wci_tensor->vec(), wcf_tensor->vec(), wco_tensor->vec(), + b_tensor->vec(), xh_tensor.matrix(), i_tensor->matrix(), + cs_tensor->matrix(), f_tensor->matrix(), o_tensor->matrix(), + ci_tensor->matrix(), co_tensor->matrix(), + icfo_tensor.matrix(), h_tensor->matrix()); + } + + for (int64 t = seq_len_max; t < max_len_; ++t) { + Tensor* cs_tensor = cs_list[t]; + Tensor* h_tensor = h_list[t]; + + functor::TensorZero()(device, cs_tensor->flat()); + functor::TensorZero()(device, h_tensor->flat()); + } + } + + private: + int64 max_len_; + float forget_bias_; + float cell_clip_; + bool use_peephole_; +}; + +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("FusedLSTM").Device(DEVICE_CPU).TypeConstraint("T"), \ + FusedLSTMOp); +REGISTER_KERNEL(float); +// REGISTER_KERNEL(double); +#undef REGISTER_KERNEL + +#if GOOGLE_CUDA +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void TensorZero::operator()(const GPUDevice& d, \ + typename TTypes::Flat t); \ + \ + extern template struct TensorZero; + +DECLARE_GPU_SPEC(float); +// DECLARE_GPU_SPEC(double); +#undef DECLARE_GPU_SPEC +} // end namespace functor + +#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("FusedLSTM") \ + .Device(DEVICE_GPU) \ + .HostMemory("seq_len_max") \ + .TypeConstraint("T"), \ + FusedLSTMOp); + +REGISTER_GPU_KERNEL(float); +// REGISTER_GPU_KERNEL(double); +#undef REGISTER_GPU_KERNEL +#endif // GOOGLE_CUDA + +template +class FusedLSTMGradOp : public OpKernel { + public: + 
explicit FusedLSTMGradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_len", &max_len_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_peephole", &use_peephole_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* seq_len_max_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("seq_len_max", &seq_len_max_tensor)); + + OpInputList x_list; + OP_REQUIRES_OK(ctx, ctx->input_list("x", &x_list)); + const int64 batch_size = x_list[0].dim_size(0); + const int64 input_size = x_list[0].dim_size(1); + + const Tensor* cs_prev_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("cs_prev", &cs_prev_tensor)); + + const Tensor* h_prev_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("h_prev", &h_prev_tensor)); + + const Tensor* w_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor)); + const int64 cell_size = w_tensor->dim_size(1) / 4; + OP_REQUIRES(ctx, input_size + cell_size == w_tensor->dim_size(0), + errors::InvalidArgument("w matrix rows don't match: ", + input_size + cell_size, " vs. ", + w_tensor->dim_size(0))); + + const Tensor* wci_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor)); + + const Tensor* wcf_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("wcf", &wcf_tensor)); + + const Tensor* wco_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("wco", &wco_tensor)); + + const Tensor* b_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("b", &b_tensor)); + OP_REQUIRES( + ctx, cell_size == b_tensor->dim_size(0) / 4, + errors::InvalidArgument("w and b cell_size don't match: ", cell_size, + " vs. 
", b_tensor->dim_size(0))); + + OpInputList i_list; + OP_REQUIRES_OK(ctx, ctx->input_list("i", &i_list)); + + OpInputList cs_list; + OP_REQUIRES_OK(ctx, ctx->input_list("cs", &cs_list)); + + OpInputList f_list; + OP_REQUIRES_OK(ctx, ctx->input_list("f", &f_list)); + + OpInputList o_list; + OP_REQUIRES_OK(ctx, ctx->input_list("o", &o_list)); + + OpInputList ci_list; + OP_REQUIRES_OK(ctx, ctx->input_list("ci", &ci_list)); + + OpInputList co_list; + OP_REQUIRES_OK(ctx, ctx->input_list("co", &co_list)); + + OpInputList h_list; + OP_REQUIRES_OK(ctx, ctx->input_list("h", &h_list)); + + OpInputList cs_grad_list; + OP_REQUIRES_OK(ctx, ctx->input_list("cs_grad", &cs_grad_list)); + + OpInputList h_grad_list; + OP_REQUIRES_OK(ctx, ctx->input_list("h_grad", &h_grad_list)); + + OpOutputList x_grad_list; + OP_REQUIRES_OK(ctx, ctx->output_list("x_grad", &x_grad_list)); + + Tensor* cs_prev_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, + ctx->allocate_output("cs_prev_grad", cs_prev_tensor->shape(), + &cs_prev_grad_tensor)); + + Tensor* h_prev_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, + ctx->allocate_output("h_prev_grad", h_prev_tensor->shape(), + &h_prev_grad_tensor)); + + Tensor* w_grad_tensor = nullptr; + OP_REQUIRES_OK( + ctx, ctx->allocate_output("w_grad", w_tensor->shape(), &w_grad_tensor)); + + Tensor* wci_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output("wci_grad", wci_tensor->shape(), + &wci_grad_tensor)); + + Tensor* wcf_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output("wcf_grad", wcf_tensor->shape(), + &wcf_grad_tensor)); + + Tensor* wco_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output("wco_grad", wco_tensor->shape(), + &wco_grad_tensor)); + + Tensor* b_grad_tensor = nullptr; + OP_REQUIRES_OK( + ctx, ctx->allocate_output("b_grad", b_tensor->shape(), &b_grad_tensor)); + + TensorShape batch_input_shape({batch_size, input_size}); + TensorShape batch_cell_shape({batch_size, cell_size}); + for (int64 t = 0; t < max_len_; 
++t) { + Tensor* x_grad_tensor = nullptr; + OP_REQUIRES_OK( + ctx, x_grad_list.allocate(t, batch_input_shape, &x_grad_tensor)); + } + + Tensor xh_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp( + DataTypeToEnum::v(), + TensorShape({batch_size, input_size + cell_size}), + &xh_tensor)); + + Tensor xh_grad_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), + xh_tensor.shape(), &xh_grad_tensor)); + + Tensor do_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), + batch_cell_shape, &do_tensor)); + + Tensor dcs_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), + batch_cell_shape, &dcs_tensor)); + + Tensor dci_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), + batch_cell_shape, &dci_tensor)); + + Tensor df_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), + batch_cell_shape, &df_tensor)); + + Tensor di_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), + batch_cell_shape, &di_tensor)); + + Tensor dicfo_tensor; + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DataTypeToEnum::v(), + TensorShape({batch_size, cell_size * 4}), + &dicfo_tensor)); + + Tensor cs_grad_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), + batch_cell_shape, &cs_grad_tensor)); + + Tensor h_grad_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::v(), + batch_cell_shape, &h_grad_tensor)); + + + const Device& device = ctx->eigen_device(); + perftools::gputools::Stream* stream = + std::is_same::value + ? 
ctx->op_device_context()->stream() + : nullptr; + + functor::TensorZero()(device, cs_grad_tensor.flat()); + functor::TensorZero()(device, + cs_prev_grad_tensor->flat()); + functor::TensorZero()(device, h_grad_tensor.flat()); + functor::TensorZero()(device, h_prev_grad_tensor->flat()); + functor::TensorZero()(device, w_grad_tensor->flat()); + functor::TensorZero()(device, wci_grad_tensor->flat()); + functor::TensorZero()(device, wcf_grad_tensor->flat()); + functor::TensorZero()(device, wco_grad_tensor->flat()); + functor::TensorZero()(device, b_grad_tensor->flat()); + + const int64 seq_len_max = seq_len_max_tensor->scalar()(); + for (int64 t = seq_len_max - 1; t >= 0; --t) { + const Tensor& x_tensor = x_list[t]; + const Tensor& cs_prev_tensor2 = t == 0 ? *cs_prev_tensor : cs_list[t - 1]; + const Tensor& h_prev_tensor2 = t == 0 ? *h_prev_tensor : h_list[t - 1]; + const Tensor& i_tensor = i_list[t]; + const Tensor& cs_tensor = cs_list[t]; + const Tensor& f_tensor = f_list[t]; + const Tensor& o_tensor = o_list[t]; + const Tensor& ci_tensor = ci_list[t]; + const Tensor& co_tensor = co_list[t]; + + // Grab previous CS grad. + const Tensor& const_cs_prev_grad_tensor = *cs_prev_grad_tensor; + functor::TensorAdd()( + device, const_cs_prev_grad_tensor.flat(), + cs_grad_list[t].flat(), cs_grad_tensor.flat()); + + // Combine previous h grad and h grad coming on top. 
+ const Tensor& const_h_prev_grad_tensor = *h_prev_grad_tensor; + functor::TensorAdd()( + device, const_h_prev_grad_tensor.flat(), h_grad_list[t].flat(), + h_grad_tensor.flat()); + + const Tensor& const_cs_grad_tensor = cs_grad_tensor; + const Tensor& const_h_grad_tensor = h_grad_tensor; + + Tensor* x_grad_tensor = x_grad_list[t]; + functor::FusedLSTMBprop(batch_size, input_size, + cell_size)( + ctx, stream, device, use_peephole_, x_tensor.matrix(), + cs_prev_tensor2.matrix(), h_prev_tensor2.matrix(), + w_tensor->matrix(), wci_tensor->vec(), wcf_tensor->vec(), + wco_tensor->vec(), b_tensor->vec(), xh_tensor.matrix(), + i_tensor.matrix(), cs_tensor.matrix(), f_tensor.matrix(), + o_tensor.matrix(), ci_tensor.matrix(), co_tensor.matrix(), + const_cs_grad_tensor.matrix(), const_h_grad_tensor.matrix(), + do_tensor.matrix(), dcs_tensor.matrix(), dci_tensor.matrix(), + df_tensor.matrix(), di_tensor.matrix(), + dicfo_tensor.matrix(), cs_prev_grad_tensor->matrix(), + h_prev_grad_tensor->matrix(), xh_grad_tensor.matrix(), + x_grad_tensor->matrix(), w_grad_tensor->matrix(), + wci_grad_tensor->vec(), wcf_grad_tensor->vec(), + wco_grad_tensor->vec(), b_grad_tensor->vec()); + } + + for (int64 t = seq_len_max; t < max_len_; ++t) { + Tensor* x_grad_tensor = x_grad_list[t]; + functor::TensorZero()(device, x_grad_tensor->flat()); + } + } + + private: + int64 max_len_; + bool use_peephole_; +}; + +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("FusedLSTMGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + FusedLSTMGradOp); +REGISTER_KERNEL(float); +// REGISTER_KERNEL(double); +#undef REGISTER_KERNEL + +#if GOOGLE_CUDA +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void TensorCopy::operator()(const GPUDevice& d, \ + typename TTypes::ConstFlat src, \ + typename TTypes::Flat dst); \ + \ + template <> \ + void TensorAdd::operator()( \ + const GPUDevice& d, typename TTypes::ConstFlat a, \ + typename TTypes::ConstFlat b, typename TTypes::Flat c); \ + 
\ + template <> \ + void FusedLSTMBprop::operator()( \ + OpKernelContext* ctx, perftools::gputools::Stream* stream, \ + const GPUDevice& d, bool use_peephole, \ + typename TTypes::ConstMatrix x, \ + typename TTypes::ConstMatrix cs_prev, \ + typename TTypes::ConstMatrix h_prev, \ + typename TTypes::ConstMatrix w, typename TTypes::ConstVec wci, \ + typename TTypes::ConstVec wcf, typename TTypes::ConstVec wco, \ + typename TTypes::ConstVec b, typename TTypes::Matrix xh, \ + typename TTypes::ConstMatrix i, typename TTypes::ConstMatrix cs, \ + typename TTypes::ConstMatrix f, typename TTypes::ConstMatrix o, \ + typename TTypes::ConstMatrix ci, typename TTypes::ConstMatrix co, \ + typename TTypes::ConstMatrix cs_grad, \ + typename TTypes::ConstMatrix h_grad, typename TTypes::Matrix do_, \ + typename TTypes::Matrix dcs, typename TTypes::Matrix dci, \ + typename TTypes::Matrix df, typename TTypes::Matrix di, \ + typename TTypes::Matrix dicfo, \ + typename TTypes::Matrix cs_prev_grad, \ + typename TTypes::Matrix h_prev_grad, \ + typename TTypes::Matrix xh_grad, typename TTypes::Matrix x_grad, \ + typename TTypes::Matrix w_grad, typename TTypes::Vec wci_grad, \ + typename TTypes::Vec wcf_grad, typename TTypes::Vec wco_grad, \ + typename TTypes::Vec b_grad); \ + \ + extern template struct TensorCopy; \ + extern template struct TensorAdd; \ + extern template struct FusedLSTMBprop; + +DECLARE_GPU_SPEC(float); +// DECLARE_GPU_SPEC(double); +#undef DECLARE_GPU_SPEC +} // end namespace functor + +#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("FusedLSTMGrad") \ + .Device(DEVICE_GPU) \ + .HostMemory("seq_len_max") \ + .TypeConstraint("T"), \ + FusedLSTMGradOp); + +REGISTER_GPU_KERNEL(float); +// REGISTER_GPU_KERNEL(double); +#undef REGISTER_GPU_KERNEL +#endif // GOOGLE_CUDA + +} // end namespace tensorflow diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.h b/tensorflow/contrib/rnn/kernels/lstm_ops.h new file mode 100644 index 00000000000..bcb7bfa1e6e --- 
/dev/null +++ b/tensorflow/contrib/rnn/kernels/lstm_ops.h @@ -0,0 +1,420 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_ +#define THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_ + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/eigen_activations.h" +#include "tensorflow/core/platform/types.h" + +namespace perftools { +namespace gputools { +class Stream; +} // end namespace gputools +} // end namespace perftools + +namespace tensorflow { +class OpKernelContext; + +namespace functor { + +template +struct TensorZero { + void operator()(const Device& d, typename TTypes::Flat t) { + t.device(d) = t.constant(T(0)); + } +}; + +template +struct TensorCopy { + void operator()(const Device& d, typename TTypes::ConstFlat src, + typename TTypes::Flat dst) { + dst.device(d) = src; + } +}; + +template +struct TensorAdd { + void operator()(const Device& d, typename TTypes::ConstFlat a, + typename TTypes::ConstFlat b, typename TTypes::Flat c) { + c.device(d) = a + b; + } +}; + +template +struct TensorZeroPadding { + void operator()(const Device& d, const int64 time_idx, + typename TTypes::ConstVec seq_len, + typename TTypes::Vec mask, + typename TTypes::Matrix m) { + // mask is shape 
[batch_size]. + mask.device(d) = seq_len.constant(time_idx) < seq_len; + + // m_shape is [batch_size, 1]. + Eigen::array m_shape({m.dimensions()[0], 1}); + // broadcast_shape is [1, units]. + Eigen::array broadcast_shape({1, m.dimensions()[1]}); + + // m is shape [batch_size, units]. + m.device(d) = m * mask.reshape(m_shape).broadcast(broadcast_shape); + } +}; + +template +struct TensorCuBlasGemm { + void operator()(OpKernelContext* ctx, perftools::gputools::Stream* stream, + bool transa, bool transb, uint64 m, uint64 n, uint64 k, + T alpha, const T* a, int lda, const T* b, int ldb, T beta, + T* c, int ldc); +}; + +template +struct TensorBlasGemm; + +template +struct TensorBlasGemm { + static void compute(OpKernelContext* ctx, perftools::gputools::Stream* stream, + const Device& d, bool transa, bool transb, T alpha, + typename TTypes::ConstMatrix a, + typename TTypes::ConstMatrix b, T beta, + typename TTypes::Matrix c) { + int64 m = c.dimensions()[0]; + int64 n = c.dimensions()[1]; + int64 k = transa ? a.dimensions()[0] : a.dimensions()[1]; + + TensorCuBlasGemm()(ctx, stream, transb, transa, n, m, k, alpha, b.data(), + transb ? k : n, a.data(), transa ? 
m : k, beta, + c.data(), n); + } +}; + +template +struct TensorBlasGemm { + static void compute(OpKernelContext* ctx, perftools::gputools::Stream* stream, + const Device& d, bool transa, bool transb, T alpha, + typename TTypes::ConstMatrix a, + typename TTypes::ConstMatrix b, T beta, + typename TTypes::Matrix c) { + Eigen::array, 1> contract_pairs; + contract_pairs[0] = + Eigen::IndexPair(transa == false, transb == true); + if (alpha == T(1) && beta == T(0)) { + c.device(d) = a.contract(b, contract_pairs); + } else if (alpha == T(1) && beta == T(1)) { + c.device(d) += a.contract(b, contract_pairs); + } else { + c.device(d) = c.constant(alpha) * a.contract(b, contract_pairs) + + c.constant(beta) * c; + } + } +}; + +struct LSTMFusedCell { + LSTMFusedCell(const int batch_size, const int input_size, const int cell_size) + : batch_size_(batch_size), + input_size_(input_size), + cell_size_(cell_size) {} + + inline Eigen::array icfo_i_offsets() const { + return {0, 0}; + } + + inline Eigen::array icfo_c_offsets() const { + return {0, cell_size_}; + } + + inline Eigen::array icfo_f_offsets() const { + return {0, cell_size_ * 2}; + } + + inline Eigen::array icfo_o_offsets() const { + return {0, cell_size_ * 3}; + } + + inline Eigen::array cell_extents() const { + return {batch_size_, cell_size_}; + } + + inline Eigen::array xh_x_offsets() const { + return {0, 0}; + } + + inline Eigen::array xh_x_extents() const { + return {batch_size_, input_size_}; + } + + inline Eigen::array xh_h_offsets() const { + return {0, input_size_}; + } + + inline Eigen::array xh_h_extents() const { + return {batch_size_, cell_size_}; + } + + protected: + const int batch_size_; + const int input_size_; + const int cell_size_; +}; + +template +struct LSTMFusedCellFprop : public LSTMFusedCell { + LSTMFusedCellFprop(const int batch_size, const int input_size, + const int cell_size) + : LSTMFusedCell(batch_size, input_size, cell_size) {} + + void operator()(OpKernelContext* ctx, 
perftools::gputools::Stream* stream, + const Device& d, const T forget_bias, const T cell_clip, + bool use_peephole, typename TTypes::ConstMatrix x, + typename TTypes::ConstMatrix cs_prev, + typename TTypes::ConstMatrix h_prev, + typename TTypes::ConstMatrix w, + typename TTypes::ConstVec wci, + typename TTypes::ConstVec wcf, + typename TTypes::ConstVec wco, + typename TTypes::ConstVec b, typename TTypes::Matrix xh, + typename TTypes::Matrix i, typename TTypes::Matrix cs, + typename TTypes::Matrix f, typename TTypes::Matrix o, + typename TTypes::Matrix ci, typename TTypes::Matrix co, + typename TTypes::Matrix icfo, + typename TTypes::Matrix h) { + // Concat xh = [x, h]. + xh.slice(xh_x_offsets(), xh_x_extents()).device(d) = x; + xh.slice(xh_h_offsets(), xh_h_extents()).device(d) = h_prev; + + // states1 = xh * w + b + typename TTypes::ConstMatrix const_xh(xh.data(), xh.dimensions()); + TensorBlasGemm::compute( + ctx, stream, d, false, false, T(1), const_xh, w, T(0), icfo); + Eigen::array b_shape({1, b.dimensions()[0]}); + Eigen::array broadcast_shape({batch_size_, 1}); + icfo.device(d) += b.reshape(b_shape).broadcast(broadcast_shape); + + Eigen::array p_shape({1, cell_size_}); + Eigen::array p_broadcast_shape({batch_size_, 1}); + + // Input gate. + if (use_peephole) { + auto i_peep = cs_prev * wci.reshape(p_shape).broadcast(p_broadcast_shape); + i.device(d) = + (icfo.slice(icfo_i_offsets(), cell_extents()) + i_peep).sigmoid(); + } else { + i.device(d) = icfo.slice(icfo_i_offsets(), cell_extents()).sigmoid(); + } + + // Cell input. + ci.device(d) = icfo.slice(icfo_c_offsets(), cell_extents()).tanh(); + + // Forget gate (w/ bias). 
+ if (use_peephole) { + auto f_peep = cs_prev * wcf.reshape(p_shape).broadcast(p_broadcast_shape); + f.device(d) = (icfo.slice(icfo_f_offsets(), cell_extents()) + + f.constant(forget_bias) + f_peep) + .sigmoid(); + } else { + f.device(d) = (icfo.slice(icfo_f_offsets(), cell_extents()) + + f.constant(forget_bias)) + .sigmoid(); + } + + // cs = ci .* i + f .* cs_prev + cs.device(d) = i * ci + f * cs_prev; + + if (cell_clip > 0.0f) { + cs.device(d) = + cs.binaryExpr(cs.constant(cell_clip), Eigen::scalar_clip_op()); + } + + // co = tanh(cs) + co.device(d) = cs.tanh(); + + // Output gate. + if (use_peephole) { + auto o_peep = cs * wco.reshape(p_shape).broadcast(p_broadcast_shape); + o.device(d) = + (icfo.slice(icfo_o_offsets(), cell_extents()) + o_peep).sigmoid(); + } else { + o.device(d) = icfo.slice(icfo_o_offsets(), cell_extents()).sigmoid(); + } + + // h = o .* co + h.device(d) = o * co; + } +}; + +template +struct LSTMFusedCellBprop : public LSTMFusedCell { + LSTMFusedCellBprop(const int batch_size, const int input_size, + const int cell_size) + : LSTMFusedCell(batch_size, input_size, cell_size) {} + + void operator()( + OpKernelContext* ctx, perftools::gputools::Stream* stream, + const Device& d, bool use_peephole, typename TTypes::ConstMatrix x, + typename TTypes::ConstMatrix cs_prev, + typename TTypes::ConstMatrix h_prev, typename TTypes::ConstMatrix w, + typename TTypes::ConstVec wci, typename TTypes::ConstVec wcf, + typename TTypes::ConstVec wco, typename TTypes::ConstVec b, + typename TTypes::ConstMatrix i, typename TTypes::ConstMatrix cs, + typename TTypes::ConstMatrix f, typename TTypes::ConstMatrix o, + typename TTypes::ConstMatrix ci, typename TTypes::ConstMatrix co, + typename TTypes::ConstMatrix cs_grad, + typename TTypes::ConstMatrix h_grad, typename TTypes::Matrix do_, + typename TTypes::Matrix dcs, typename TTypes::Matrix dci, + typename TTypes::Matrix df, typename TTypes::Matrix di, + typename TTypes::Matrix dicfo, typename TTypes::Matrix 
cs_prev_grad, + typename TTypes::Vec wci_grad, typename TTypes::Vec wcf_grad, + typename TTypes::Vec wco_grad) { + // do[t] = sigm'(o[t]) .* dh[t] .* co[t] + do_.device(d) = o * (o.constant(T(1)) - o) * h_grad * co; + + // dcs[t] += tanh'(cs[t]) .* dh[t] .* o[t] + dcs[t + 1] .* f[t + 1] + dcs.device(d) = (co.constant(T(1)) - co * co) * h_grad * o + cs_grad; + + Eigen::array p_shape({1, cell_size_}); + Eigen::array p_broadcast_shape({batch_size_, 1}); + if (use_peephole) { + dcs.device(d) = + dcs + do_ * wco.reshape(p_shape).broadcast(p_broadcast_shape); + } + + // dci[t] = tanh'(ci[t]) dcs[t] i[t] + dci.device(d) = (ci.constant(T(1)) - ci * ci) * dcs * i; + + // df[t] = sigm'(f[t]) dcs[t] cs[t - 1] + df.device(d) = f * (f.constant(T(1)) - f) * dcs * cs_prev; + + // di[t] = sigm'(i[t]) dcs[t] ci[t] + di.device(d) = i * (i.constant(T(1)) - i) * dcs * ci; + + dicfo.slice(icfo_i_offsets(), cell_extents()).device(d) = di; + dicfo.slice(icfo_c_offsets(), cell_extents()).device(d) = dci; + dicfo.slice(icfo_f_offsets(), cell_extents()).device(d) = df; + dicfo.slice(icfo_o_offsets(), cell_extents()).device(d) = do_; + + cs_prev_grad.device(d) = dcs * f; + if (use_peephole) { + cs_prev_grad.device(d) = + cs_prev_grad + + di * wci.reshape(p_shape).broadcast(p_broadcast_shape) + + df * wcf.reshape(p_shape).broadcast(p_broadcast_shape); + } + + if (use_peephole) { + wci_grad.device(d) = (di * cs_prev).sum(Eigen::array({0})); + wcf_grad.device(d) = (df * cs_prev).sum(Eigen::array({0})); + wco_grad.device(d) = (do_ * cs).sum(Eigen::array({0})); + } + } +}; + +template +struct FusedLSTMBprop : public LSTMFusedCell { + FusedLSTMBprop(const int batch_size, const int input_size, + const int cell_size) + : LSTMFusedCell(batch_size, input_size, cell_size) {} + + void operator()( + OpKernelContext* ctx, perftools::gputools::Stream* stream, + const Device& d, bool use_peephole, typename TTypes::ConstMatrix x, + typename TTypes::ConstMatrix cs_prev, + typename TTypes::ConstMatrix h_prev, 
typename TTypes::ConstMatrix w, + typename TTypes::ConstVec wci, typename TTypes::ConstVec wcf, + typename TTypes::ConstVec wco, typename TTypes::ConstVec b, + typename TTypes::Matrix xh, typename TTypes::ConstMatrix i, + typename TTypes::ConstMatrix cs, typename TTypes::ConstMatrix f, + typename TTypes::ConstMatrix o, typename TTypes::ConstMatrix ci, + typename TTypes::ConstMatrix co, + typename TTypes::ConstMatrix cs_grad, + typename TTypes::ConstMatrix h_grad, typename TTypes::Matrix do_, + typename TTypes::Matrix dcs, typename TTypes::Matrix dci, + typename TTypes::Matrix df, typename TTypes::Matrix di, + typename TTypes::Matrix dicfo, typename TTypes::Matrix cs_prev_grad, + typename TTypes::Matrix h_prev_grad, + typename TTypes::Matrix xh_grad, typename TTypes::Matrix x_grad, + typename TTypes::Matrix w_grad, typename TTypes::Vec wci_grad, + typename TTypes::Vec wcf_grad, typename TTypes::Vec wco_grad, + typename TTypes::Vec b_grad) { + // do[t] = sigm'(o[t]) .* dh[t] .* co[t] + do_.device(d) = o * (o.constant(T(1)) - o) * h_grad * co; + + // dcs[t] += tanh'(cs[t]) .* dh[t] .* o[t] + dcs[t + 1] .* f[t + 1] + dcs.device(d) = (co.constant(T(1)) - co * co) * h_grad * o + cs_grad; + + Eigen::array p_shape({1, cell_size_}); + Eigen::array p_broadcast_shape({batch_size_, 1}); + if (use_peephole) { + dcs.device(d) = + dcs + do_ * wco.reshape(p_shape).broadcast(p_broadcast_shape); + } + + // dci[t] = tanh'(ci[t]) dcs[t] i[t] + dci.device(d) = (ci.constant(T(1)) - ci * ci) * dcs * i; + + // df[t] = sigm'(f[t]) dcs[t] cs[t - 1] + df.device(d) = f * (f.constant(T(1)) - f) * dcs * cs_prev; + + // di[t] = sigm'(i[t]) dcs[t] ci[t] + di.device(d) = i * (i.constant(T(1)) - i) * dcs * ci; + + dicfo.slice(icfo_i_offsets(), cell_extents()).device(d) = di; + dicfo.slice(icfo_c_offsets(), cell_extents()).device(d) = dci; + dicfo.slice(icfo_f_offsets(), cell_extents()).device(d) = df; + dicfo.slice(icfo_o_offsets(), cell_extents()).device(d) = do_; + + cs_prev_grad.device(d) = dcs 
* f; + if (use_peephole) { + cs_prev_grad.device(d) = + cs_prev_grad + + di * wci.reshape(p_shape).broadcast(p_broadcast_shape) + + df * wcf.reshape(p_shape).broadcast(p_broadcast_shape); + } + + // xh_grad. + typename TTypes::ConstMatrix const_dicfo(dicfo.data(), + dicfo.dimensions()); + TensorBlasGemm::compute( + ctx, stream, d, false, true, T(1), const_dicfo, w, T(0), xh_grad); + + // xh. + xh.slice(xh_x_offsets(), xh_x_extents()).device(d) = x; + xh.slice(xh_h_offsets(), xh_h_extents()).device(d) = h_prev; + typename TTypes::ConstMatrix const_xh(xh.data(), xh.dimensions()); + + // x_grad. + x_grad.device(d) = xh_grad.slice(xh_x_offsets(), xh_x_extents()); + h_prev_grad.device(d) = xh_grad.slice(xh_h_offsets(), xh_h_extents()); + + // w_grad. + TensorBlasGemm::compute( + ctx, stream, d, true, false, T(1), const_xh, const_dicfo, T(1), w_grad); + + // b_grad. + b_grad.device(d) += dicfo.sum(Eigen::array({0})); + + if (use_peephole) { + wci_grad.device(d) += (di * cs_prev).sum(Eigen::array({0})); + wcf_grad.device(d) += (df * cs_prev).sum(Eigen::array({0})); + wco_grad.device(d) += (do_ * cs).sum(Eigen::array({0})); + } + } +}; + +} // namespace functor +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_ diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc b/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc new file mode 100644 index 00000000000..2c5e500c289 --- /dev/null +++ b/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc @@ -0,0 +1,41 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/contrib/rnn/kernels/lstm_ops.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +#define DEFINE_GPU_SPECS(T) \ + template struct TensorZero; \ + template struct TensorCopy; \ + template struct TensorAdd; \ + template struct LSTMFusedCellFprop; \ + template struct LSTMFusedCellBprop; \ + template struct FusedLSTMBprop; + +DEFINE_GPU_SPECS(float); +// DEFINE_GPU_SPECS(double); +#undef DEFINE_GPU_SPECS + +} // end namespace functor +} // end namespace tensorflow +#endif // GOOGLE_CUDA diff --git a/tensorflow/contrib/rnn/ops/lstm_ops.cc b/tensorflow/contrib/rnn/ops/lstm_ops.cc new file mode 100644 index 00000000000..a55c6232886 --- /dev/null +++ b/tensorflow/contrib/rnn/ops/lstm_ops.cc @@ -0,0 +1,180 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/op.h" + +namespace tensorflow { + +REGISTER_OP("LSTMFusedCell") + .Input("x: T") + .Input("cs_prev: T") + .Input("h_prev: T") + .Input("w: T") + .Input("wci: T") + .Input("wcf: T") + .Input("wco: T") + .Input("b: T") + .Output("i: T") + .Output("cs: T") + .Output("f: T") + .Output("o: T") + .Output("ci: T") + .Output("co: T") + .Output("h: T") + .Attr("forget_bias: float = 1.0") + .Attr("cell_clip: float = 3.0") + .Attr("use_peephole: bool = false") + .Attr("T: {float}") + .Doc(R"doc( +Computes the LSTM cell forward propagation for 1 time step. + +This implementation uses 1 weight matrix and 1 bias vector, there is no +diagonal peephole connection. + +This kernel op implements the following mathematical equations: + +```python +xh = [x, h_prev] +[i, f, ci, o] = xh * w + b +f = f + forget_bias + +i = sigmoid(i) +f = sigmoid(f) +ci = tanh(ci) +o = sigmoid(o) + +cs = ci .* i + cs_prev .* f +co = tanh(cs) + +h = co .* o +``` + +forget_bias: The forget gate bias. +x: The input to the LSTM cell. +w: The weight matrix. +b: The bias vector. +i: The input gate. +cs: The cell state before the tanh. +f: The forget gate. +o: The output gate. +ci: The cell input. +co: The cell after the tanh. +h: The output h vector. +)doc"); + +REGISTER_OP("LSTMFusedCellGrad") + .Input("x: T") + .Input("cs_prev: T") + .Input("h_prev: T") + .Input("w: T") + .Input("wci: T") + .Input("wcf: T") + .Input("wco: T") + .Input("b: T") + .Input("i: T") + .Input("cs: T") + .Input("f: T") + .Input("o: T") + .Input("ci: T") + .Input("co: T") + .Input("cs_grad: T") + .Input("h_grad: T") + .Output("cs_prev_grad: T") + .Output("dicfo: T") + .Output("wci_grad: T") + .Output("wcf_grad: T") + .Output("wco_grad: T") + .Attr("use_peephole: bool") + .Attr("T: {float}") + .Doc(R"doc( +Computes the LSTM cell backward propagation for 1 timestep. 
+ +This implementation is to be used in conjunction with LSTMFusedCell. + +x: The input to the LSTM cell. +cs_prev: The previous cell state. +h_prev: The previous h state. +w: The weight matrix. +b: The bias vector. +i: The input gate. +cs: The cell state before the tanh. +f: The forget gate. +o: The output gate. +ci: The cell input. +co: The cell after the tanh. +h_grad: The gradient of h vector. +cs_prev_grad: The gradient of cs. +dicfo: The derivative wrt [i, cs, f, o]. +)doc"); + +REGISTER_OP("FusedLSTM") + .Input("seq_len_max: int64") + .Input("x: max_len * T") + .Input("cs_prev: T") + .Input("h_prev: T") + .Input("w: T") + .Input("wci: T") + .Input("wcf: T") + .Input("wco: T") + .Input("b: T") + .Output("i: max_len * T") + .Output("cs: max_len * T") + .Output("f: max_len * T") + .Output("o: max_len * T") + .Output("ci: max_len * T") + .Output("co: max_len * T") + .Output("h: max_len * T") + .Attr("max_len: int") + .Attr("forget_bias: float = 1.0") + .Attr("cell_clip: float = 3.0") + .Attr("use_peephole: bool = false") + .Attr("T: {float}") + .Doc(R"doc( +)doc"); + +REGISTER_OP("FusedLSTMGrad") + .Input("seq_len_max: int64") + .Input("x: max_len * T") + .Input("cs_prev: T") + .Input("h_prev: T") + .Input("w: T") + .Input("wci: T") + .Input("wcf: T") + .Input("wco: T") + .Input("b: T") + .Input("i: max_len * T") + .Input("cs: max_len * T") + .Input("f: max_len * T") + .Input("o: max_len * T") + .Input("ci: max_len * T") + .Input("co: max_len * T") + .Input("h: max_len * T") + .Input("cs_grad: max_len * T") + .Input("h_grad: max_len * T") + .Output("x_grad: max_len * T") + .Output("cs_prev_grad: T") + .Output("h_prev_grad: T") + .Output("w_grad: T") + .Output("wci_grad: T") + .Output("wcf_grad: T") + .Output("wco_grad: T") + .Output("b_grad: T") + .Attr("max_len: int") + .Attr("use_peephole: bool") + .Attr("T: {float}") + .Doc(R"doc( +)doc"); + +} // end namespace tensorflow diff --git a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py new file mode 100644 index 00000000000..70aeb5ff559 --- /dev/null +++ b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py @@ -0,0 +1,290 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""LSTM Fused Cell ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import tensorflow as tf + +from tensorflow.contrib.rnn.python.ops import lstm_ops + + +fused_lstm = lstm_ops._fused_lstm # pylint: disable=protected-access + + +class LSTMFusedCellTest(tf.test.TestCase): + _use_gpu = False + + def testNoneDimsWithDynamicRNN(self): + with self.test_session(use_gpu=self._use_gpu, graph=tf.Graph()) as sess: + batch_size = 4 + num_steps = 5 + input_dim = 6 + cell_size = 7 + + cell = tf.contrib.rnn.LSTMFusedCell(cell_size) + x = tf.placeholder(tf.float32, shape=(None, None, input_dim)) + + output, _ = tf.nn.dynamic_rnn(cell, x, time_major=True, dtype=tf.float32) + sess.run(tf.initialize_all_variables()) + feed = {} + feed[x] = np.random.randn(num_steps, batch_size, input_dim) + sess.run(output, feed) + + def testLSTMFusedCell(self): + with self.test_session(use_gpu=self._use_gpu, graph=tf.Graph()) as sess: + with tf.variable_scope("root", 
initializer=tf.constant_initializer(0.5)): + x = tf.zeros([1, 2]) + m0 = tf.zeros([1, 2]) + m1 = tf.zeros([1, 2]) + m2 = tf.zeros([1, 2]) + m3 = tf.zeros([1, 2]) + g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.nn.rnn_cell.MultiRNNCell( + [tf.contrib.rnn.LSTMFusedCell(2)] * 2, + state_is_tuple=True)(x, ((m0, m1), (m2, m3))) + sess.run([tf.initialize_all_variables()]) + res = sess.run([g, out_m0, out_m1, out_m2, out_m3], + {x.name: np.array([[1., 1.]]), + m0.name: 0.1 * np.ones([1, 2]), + m1.name: 0.1 * np.ones([1, 2]), + m2.name: 0.1 * np.ones([1, 2]), + m3.name: 0.1 * np.ones([1, 2])}) + self.assertEqual(len(res), 5) + self.assertAllClose(res[0], [[0.24024698, 0.24024698]]) + # These numbers are from testBasicLSTMCell and only test c/h. + self.assertAllClose(res[1], [[0.68967271, 0.68967271]]) + self.assertAllClose(res[2], [[0.44848421, 0.44848421]]) + self.assertAllClose(res[3], [[0.39897051, 0.39897051]]) + self.assertAllClose(res[4], [[0.24024698, 0.24024698]]) + + def testLSTMBasicToBlockCell(self): + with self.test_session(use_gpu=self._use_gpu) as sess: + x = tf.zeros([1, 2]) + x_values = np.random.randn(1, 2) + + m0_val = 0.1 * np.ones([1, 2]) + m1_val = -0.1 * np.ones([1, 2]) + m2_val = -0.2 * np.ones([1, 2]) + m3_val = 0.2 * np.ones([1, 2]) + + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=19890212) + with tf.variable_scope("basic", initializer=initializer): + m0 = tf.zeros([1, 2]) + m1 = tf.zeros([1, 2]) + m2 = tf.zeros([1, 2]) + m3 = tf.zeros([1, 2]) + g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.nn.rnn_cell.MultiRNNCell( + [tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True)] * 2, + state_is_tuple=True)(x, ((m0, m1), (m2, m3))) + sess.run([tf.initialize_all_variables()]) + basic_res = sess.run([g, out_m0, out_m1, out_m2, out_m3], + {x.name: x_values, + m0.name: m0_val, + m1.name: m1_val, + m2.name: m2_val, + m3.name: m3_val}) + + with tf.variable_scope("block", initializer=initializer): + m0 = tf.zeros([1, 2]) + m1 = tf.zeros([1, 2]) + 
m2 = tf.zeros([1, 2]) + m3 = tf.zeros([1, 2]) + g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.nn.rnn_cell.MultiRNNCell( + [tf.contrib.rnn.LSTMFusedCell(2)] * 2, + state_is_tuple=True)(x, ((m0, m1), (m2, m3))) + sess.run([tf.initialize_all_variables()]) + block_res = sess.run([g, out_m0, out_m1, out_m2, out_m3], + {x.name: x_values, + m0.name: m0_val, + m1.name: m1_val, + m2.name: m2_val, + m3.name: m3_val}) + + self.assertEqual(len(basic_res), len(block_res)) + for basic, block in zip(basic_res, block_res): + self.assertAllClose(basic, block) + + def testLSTMBasicToBlockCellPeeping(self): + with self.test_session(use_gpu=self._use_gpu) as sess: + x = tf.zeros([1, 2]) + x_values = np.random.randn(1, 2) + + m0_val = 0.1 * np.ones([1, 2]) + m1_val = -0.1 * np.ones([1, 2]) + m2_val = -0.2 * np.ones([1, 2]) + m3_val = 0.2 * np.ones([1, 2]) + + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=19890212) + with tf.variable_scope("basic", initializer=initializer): + m0 = tf.zeros([1, 2]) + m1 = tf.zeros([1, 2]) + m2 = tf.zeros([1, 2]) + m3 = tf.zeros([1, 2]) + g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.nn.rnn_cell.MultiRNNCell( + [tf.nn.rnn_cell.LSTMCell(2, + use_peepholes=True, + state_is_tuple=True)] * 2, + state_is_tuple=True)(x, ((m0, m1), (m2, m3))) + sess.run([tf.initialize_all_variables()]) + basic_res = sess.run([g, out_m0, out_m1, out_m2, out_m3], + {x.name: x_values, + m0.name: m0_val, + m1.name: m1_val, + m2.name: m2_val, + m3.name: m3_val}) + + with tf.variable_scope("block", initializer=initializer): + m0 = tf.zeros([1, 2]) + m1 = tf.zeros([1, 2]) + m2 = tf.zeros([1, 2]) + m3 = tf.zeros([1, 2]) + g, ((out_m0, out_m1), (out_m2, out_m3)) = tf.nn.rnn_cell.MultiRNNCell( + [tf.contrib.rnn.LSTMFusedCell(2, use_peephole=True)] * 2, + state_is_tuple=True)(x, ((m0, m1), (m2, m3))) + sess.run([tf.initialize_all_variables()]) + block_res = sess.run([g, out_m0, out_m1, out_m2, out_m3], + {x.name: x_values, + m0.name: m0_val, + m1.name: m1_val, + m2.name: 
m2_val, + m3.name: m3_val}) + + self.assertEqual(len(basic_res), len(block_res)) + for basic, block in zip(basic_res, block_res): + self.assertAllClose(basic, block) + + def testLSTMBasicToBlock(self): + with self.test_session(use_gpu=self._use_gpu) as sess: + batch_size = 2 + input_size = 3 + cell_size = 4 + sequence_length = 5 + + inputs = [] + for _ in range(sequence_length): + inp = tf.convert_to_tensor( + np.random.randn(batch_size, input_size), + dtype=tf.float32) + inputs.append(inp) + + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=19890212) + with tf.variable_scope("basic", initializer=initializer): + cell = tf.nn.rnn_cell.BasicLSTMCell(cell_size, state_is_tuple=True) + outputs, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) + + sess.run([tf.initialize_all_variables()]) + basic_outputs = sess.run(outputs) + basic_grads = sess.run(tf.gradients(outputs, inputs)) + basic_wgrads = sess.run(tf.gradients(outputs, tf.trainable_variables())) + + with tf.variable_scope("block", initializer=initializer): + w = tf.get_variable("w", + shape=[input_size + cell_size, cell_size * 4], + dtype=tf.float32) + b = tf.get_variable("b", + shape=[cell_size * 4], + dtype=tf.float32, + initializer=tf.zeros_initializer) + + _, _, _, _, _, _, outputs = fused_lstm( + tf.convert_to_tensor(sequence_length, + dtype=tf.int64), + inputs, + w, + b, + cell_clip=0) + + sess.run([tf.initialize_all_variables()]) + block_outputs = sess.run(outputs) + block_grads = sess.run(tf.gradients(outputs, inputs)) + block_wgrads = sess.run(tf.gradients(outputs, [w, b])) + + self.assertAllClose(basic_outputs, block_outputs) + self.assertAllClose(basic_grads, block_grads) + for basic, block in zip(basic_wgrads, block_wgrads): + self.assertAllClose(basic, block, rtol=1e-2, atol=1e-2) + + def testLSTMBasicToBlockPeeping(self): + with self.test_session(use_gpu=self._use_gpu) as sess: + batch_size = 2 + input_size = 3 + cell_size = 4 + sequence_length = 5 + + inputs = [] + for _ in 
range(sequence_length): + inp = tf.convert_to_tensor( + np.random.randn(batch_size, input_size), + dtype=tf.float32) + inputs.append(inp) + + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=19890212) + with tf.variable_scope("basic", initializer=initializer): + cell = tf.nn.rnn_cell.LSTMCell(cell_size, + use_peepholes=True, + state_is_tuple=True) + outputs, _ = tf.nn.rnn(cell, inputs, dtype=tf.float32) + + sess.run([tf.initialize_all_variables()]) + basic_outputs = sess.run(outputs) + basic_grads = sess.run(tf.gradients(outputs, inputs)) + basic_wgrads = sess.run(tf.gradients(outputs, tf.trainable_variables())) + + with tf.variable_scope("block", initializer=initializer): + w = tf.get_variable("w", + shape=[input_size + cell_size, cell_size * 4], + dtype=tf.float32) + b = tf.get_variable("b", + shape=[cell_size * 4], + dtype=tf.float32, + initializer=tf.zeros_initializer) + + wci = tf.get_variable("wci", shape=[cell_size], dtype=tf.float32) + wcf = tf.get_variable("wcf", shape=[cell_size], dtype=tf.float32) + wco = tf.get_variable("wco", shape=[cell_size], dtype=tf.float32) + + _, _, _, _, _, _, outputs = fused_lstm( + tf.convert_to_tensor(sequence_length, + dtype=tf.int64), + inputs, + w, + b, + wci=wci, + wcf=wcf, + wco=wco, + cell_clip=0, + use_peephole=True) + + sess.run([tf.initialize_all_variables()]) + block_outputs = sess.run(outputs) + block_grads = sess.run(tf.gradients(outputs, inputs)) + block_wgrads = sess.run(tf.gradients(outputs, [w, b, wci, wcf, wco])) + + self.assertAllClose(basic_outputs, block_outputs) + self.assertAllClose(basic_grads, block_grads) + for basic, block in zip(basic_wgrads, block_wgrads): + self.assertAllClose(basic, block, rtol=1e-2, atol=1e-2) + + +class LSTMFusedCellGpuTest(LSTMFusedCellTest): + _use_gpu = True + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py new file mode 100644 index 00000000000..231d13caa6c 
--- /dev/null +++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py @@ -0,0 +1,456 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""LSTM Fused Cell ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import load_library +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import rnn_cell +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.platform import resource_loader + +_lstm_ops_so = load_library.load_op_library( + resource_loader.get_path_to_datafile("_lstm_ops.so")) +assert _lstm_ops_so, "Could not load _lstm_ops.so." + + +# pylint: disable=invalid-name +def _lstm_fused_cell(x, + cs_prev, + h_prev, + w, + b, + wci=None, + wcf=None, + wco=None, + forget_bias=None, + cell_clip=None, + use_peephole=None, + name=None): + r"""Computes the LSTM cell forward propagation for 1 time step. + + This implementation uses 1 weight matrix and 1 bias vector, there is no + diagonal peephole connection. 
+ + This kernel op implements the following mathematical equations: + + ```python + xh = [x, h_prev] + [i, f, ci, o] = xh * w + b + f = f + forget_bias + + i = sigmoid(i) + f = sigmoid(f) + ci = tanh(ci) + o = sigmoid(o) + + cs = ci .* i + cs_prev .* f + co = tanh(cs) + + h = co .* o + ``` + + Args: + x: A `Tensor`. Must be one of the following types: `float32`, `float64`. + The input to the LSTM cell. + cs_prev: A `Tensor`. Must have the same type as `x`. + h_prev: A `Tensor`. Must have the same type as `x`. + w: A `Tensor`. Must have the same type as `x`. The weight matrix. + b: A `Tensor`. Must have the same type as `x`. The bias vector. + wci: A `Tensor`. Must have the same type as `x`. + wcf: A `Tensor`. Must have the same type as `x`. + wco: A `Tensor`. Must have the same type as `x`. + forget_bias: An optional `float`. Defaults to `1`. The forget gate bias. + cell_clip: An optional `float`. Defaults to `3`. + use_peephole: An optional `bool`. Defaults to `False`. + name: A name for the operation (optional). + + Returns: + A tuple of `Tensor` objects (i, cs, f, o, ci, co, h). + i: A `Tensor`. Has the same type as `x`. The input gate. + cs: A `Tensor`. Has the same type as `x`. The cell state before the tanh. + f: A `Tensor`. Has the same type as `x`. The forget gate. + o: A `Tensor`. Has the same type as `x`. The output gate. + ci: A `Tensor`. Has the same type as `x`. The cell input. + co: A `Tensor`. Has the same type as `x`. The cell after the tanh. + h: A `Tensor`. Has the same type as `x`. The output h vector. + + Raises: + ValueError: If cell_size is None. 
+ """ + if wci is None: + cell_size = cs_prev.get_shape().with_rank(2)[1].value + if cell_size is None: + raise ValueError("cell_size from `cs_prev` should not be None.") + wci = array_ops.constant(0, dtype=dtypes.float32, shape=[cell_size]) + wco = wci + wcf = wci + + # pylint: disable=protected-access + return _lstm_ops_so.lstm_fused_cell(x=x, + cs_prev=cs_prev, + h_prev=h_prev, + w=w, + wci=wci, + wco=wco, + wcf=wcf, + b=b, + forget_bias=forget_bias, + cell_clip=cell_clip, + use_peephole=use_peephole, + name=name) + # pylint: enable=protected-access + + +def _fused_lstm(seq_len_max, + x, + w, + b, + cs_prev=None, + h_prev=None, + wci=None, + wcf=None, + wco=None, + forget_bias=None, + cell_clip=None, + use_peephole=None, + name=None): + r"""TODO(williamchan): add doc. + + Args: + seq_len_max: A `Tensor` of type `int64`. + x: A list of at least 1 `Tensor` objects of the same type in: `float32`. + w: A `Tensor`. Must have the same type as `x`. + b: A `Tensor`. Must have the same type as `x`. + cs_prev: A `Tensor`. Must have the same type as `x`. + h_prev: A `Tensor`. Must have the same type as `x`. + wci: A `Tensor`. Must have the same type as `x`. + wcf: A `Tensor`. Must have the same type as `x`. + wco: A `Tensor`. Must have the same type as `x`. + forget_bias: An optional `float`. Defaults to `1`. + cell_clip: An optional `float`. Defaults to `3`. + use_peephole: An optional `bool`. Defaults to `False`. + name: A name for the operation (optional). + + Returns: + A tuple of `Tensor` objects (i, cs, f, o, ci, co, h). + i: A list with the same number of `Tensor` objects as `x` of `Tensor` + objects of the same type as x. + cs: A list with the same number of `Tensor` objects as `x` of `Tensor` + objects of the same type as x. + f: A list with the same number of `Tensor` objects as `x` of `Tensor` + objects of the same type as x. + o: A list with the same number of `Tensor` objects as `x` of `Tensor` + objects of the same type as x. 
+ ci: A list with the same number of `Tensor` objects as `x` of `Tensor` + objects of the same type as x. + co: A list with the same number of `Tensor` objects as `x` of `Tensor` + objects of the same type as x. + h: A list with the same number of `Tensor` objects as `x` of `Tensor` + objects of the same type as x. + + Raises: + ValueError: If `b` does not have a valid shape. + """ + batch_size = x[0].get_shape().with_rank(2)[0].value + cell_size4 = b.get_shape().with_rank(1)[0].value + if cell_size4 is None: + raise ValueError("`b` shape must not be None.") + cell_size = cell_size4 / 4 + zero_state = None + if cs_prev is None or h_prev is None: + zero_state = array_ops.constant(0, + dtype=dtypes.float32, + shape=[batch_size, cell_size]) + if cs_prev is None: + cs_prev = zero_state + if h_prev is None: + h_prev = zero_state + if wci is None: + wci = array_ops.constant(0, dtype=dtypes.float32, shape=[cell_size]) + wco = wci + wcf = wci + + # pylint: disable=protected-access + return _lstm_ops_so.fused_lstm(seq_len_max=seq_len_max, + x=x, + cs_prev=cs_prev, + h_prev=h_prev, + w=w, + wci=wci, + wco=wco, + wcf=wcf, + b=b, + forget_bias=forget_bias, + cell_clip=cell_clip, + name=name, + use_peephole=use_peephole) + # pylint: enable=protected-access + # pylint: enable=invalid-name + + +ops.RegisterShape("LSTMFusedCell")(None) +_lstm_fused_cell_grad_outputs = ["cs_prev_grad", "dicfo"] + + +@ops.RegisterShape("LSTMFusedCell") +def _LSTMFusedCellShape(op): + batch_size = op.inputs[0].get_shape().with_rank(2)[0].value + cell_size = op.inputs[1].get_shape().with_rank(2)[1].value + + return (tensor_shape.TensorShape([batch_size, cell_size]), + tensor_shape.TensorShape([batch_size, cell_size]), + tensor_shape.TensorShape([batch_size, cell_size]), + tensor_shape.TensorShape([batch_size, cell_size]), + tensor_shape.TensorShape([batch_size, cell_size]), + tensor_shape.TensorShape([batch_size, cell_size]), + tensor_shape.TensorShape([batch_size, cell_size])) + + 
+@ops.RegisterGradient("LSTMFusedCell") +def _LSTMFusedCellGrad(op, *grad): + """Gradient for LSTMFusedCell.""" + (x, cs_prev, h_prev, w, wci, wco, wcf, b) = op.inputs + (i, cs, f, o, ci, co, _) = op.outputs + (_, cs_grad, _, _, _, _, h_grad) = grad + + batch_size = x.get_shape().with_rank(2)[0].value + if batch_size is None: + batch_size = -1 + input_size = x.get_shape().with_rank(2)[1].value + if input_size is None: + raise ValueError("input_size from `x` should not be None.") + cell_size = cs_prev.get_shape().with_rank(2)[1].value + if cell_size is None: + raise ValueError("cell_size from `cs_prev` should not be None.") + + (cs_prev_grad, dicfo, wci_grad, wcf_grad, + wco_grad) = _lstm_ops_so.lstm_fused_cell_grad( + x, + cs_prev, + h_prev, + w, + wci, + wcf, + wco, + b, + i, + cs, + f, + o, + ci, + co, + cs_grad, + h_grad, + use_peephole=op.get_attr("use_peephole")) + + # Backprop from dicfo to xh. + xh_grad = math_ops.matmul(dicfo, w, transpose_b=True) + + x_grad = array_ops.slice(xh_grad, (0, 0), (batch_size, input_size)) + x_grad.get_shape().merge_with(x.get_shape()) + + h_prev_grad = array_ops.slice(xh_grad, (0, input_size), + (batch_size, cell_size)) + h_prev_grad.get_shape().merge_with(h_prev.get_shape()) + + # Backprop from dicfo to w. + xh = array_ops.concat(1, [x, h_prev]) + w_grad = math_ops.matmul(xh, dicfo, transpose_a=True) + w_grad.get_shape().merge_with(w.get_shape()) + + # Backprop from dicfo to b. 
+ b_grad = nn_ops.bias_add_grad(dicfo) + b_grad.get_shape().merge_with(b.get_shape()) + + return (x_grad, cs_prev_grad, h_prev_grad, w_grad, wci_grad, wcf_grad, + wco_grad, b_grad) + + +@ops.RegisterShape("LSTMFusedCellGrad") +def _LSTMFusedCellGradShape(op): + batch_size = op.inputs[0].get_shape().with_rank(2)[0].value + cell_size = op.inputs[1].get_shape().with_rank(2)[1].value + + return [tensor_shape.TensorShape([batch_size, cell_size]), + tensor_shape.TensorShape([batch_size, cell_size * 4]), + tensor_shape.TensorShape([cell_size]), + tensor_shape.TensorShape([cell_size]), + tensor_shape.TensorShape([cell_size])] + + +@ops.RegisterShape("FusedLSTM") +def _FusedLSTMShape(op): + max_len = op.get_attr("max_len") + + x = op.inputs[1] + b = op.inputs[-1] + + batch_size = x.get_shape().with_rank(2)[0].value + cell_size = b.get_shape().with_rank(1)[0].value / 4 + + return [tensor_shape.TensorShape([batch_size, cell_size])] * max_len * 7 + + +@ops.RegisterGradient("FusedLSTM") +def _FusedLSTMGrad(op, *grad): + """Gradient for FusedLSTM.""" + max_len = op.get_attr("max_len") + + seq_len_max = op.inputs[0] + x = op.inputs[1:1 + max_len] + cs_prev = op.inputs[-7] + h_prev = op.inputs[-6] + w = op.inputs[-5] + wci = op.inputs[-4] + wco = op.inputs[-3] + wcf = op.inputs[-2] + b = op.inputs[-1] + + i = op.outputs[0 * max_len:1 * max_len] + cs = op.outputs[1 * max_len:2 * max_len] + f = op.outputs[2 * max_len:3 * max_len] + o = op.outputs[3 * max_len:4 * max_len] + ci = op.outputs[4 * max_len:5 * max_len] + co = op.outputs[5 * max_len:6 * max_len] + h = op.outputs[6 * max_len:7 * max_len] + + cs_grad = grad[-max_len * 2:-max_len] + h_grad = grad[-max_len:] + + (x_grad, cs_prev_grad, h_prev_grad, w_grad, wci_grad, wco_grad, wcf_grad, + b_grad) = _lstm_ops_so.fused_lstm_grad( + seq_len_max, + x, + cs_prev, + h_prev, + w, + wci, + wco, + wcf, + b, + i, + cs, + f, + o, + ci, + co, + h, + cs_grad, + h_grad, + use_peephole=op.get_attr("use_peephole")) + + return [None] + x_grad + 
[cs_prev_grad, h_prev_grad, w_grad, wci_grad, + wco_grad, wcf_grad, b_grad] + + +@ops.RegisterShape("FusedLSTMGrad") +def _FusedLSTMGradShape(op): + """Shape for FusedLSTM.""" + max_len = op.get_attr("max_len") + + x = op.inputs[1] + cs_prev = op.inputs[1 + max_len] + h_prev = op.inputs[2 + max_len] + w = op.inputs[3 + max_len] + wci = op.inputs[4 + max_len] + wco = op.inputs[5 + max_len] + wcf = op.inputs[6 + max_len] + b = op.inputs[7 + max_len] + + x_shape = x.get_shape().with_rank(2) + cs_prev_shape = cs_prev.get_shape().with_rank(2) + h_prev_shape = h_prev.get_shape().with_rank(2) + w_shape = w.get_shape().with_rank(2) + wci_shape = wci.get_shape().with_rank(1) + wco_shape = wco.get_shape().with_rank(1) + wcf_shape = wcf.get_shape().with_rank(1) + b_shape = b.get_shape().with_rank(1) + + return [x_shape] * max_len + [cs_prev_shape, h_prev_shape, w_shape, wci_shape, + wco_shape, wcf_shape, b_shape] + + +class LSTMFusedCell(rnn_cell.RNNCell): + """Basic LSTM recurrent network cell. + + The implementation is based on: http://arxiv.org/abs/1409.2329. + + We add forget_bias (default: 1) to the biases of the forget gate in order to + reduce the scale of forgetting in the beginning of the training. + + Unlike BasicLSTMCell, this is a monolithic op and should be much faster. The + weight and bias matrixes should be compatible as long as the variabel scope + matches. + """ + + def __init__(self, num_units, forget_bias=1.0, use_peephole=False): + """Initialize the basic LSTM cell. + + Args: + num_units: int, The number of units in the LSTM cell. + forget_bias: float, The bias added to forget gates (see above). + use_peephole: Whether to use peephole connections or not. 
+ """ + self._num_units = num_units + self._forget_bias = forget_bias + self._use_peephole = use_peephole + + @property + def state_size(self): + return (self._num_units,) * 2 + + @property + def output_size(self): + return self._num_units + + def __call__(self, x, states_prev, scope=None): + """Long short-term memory cell (LSTM).""" + with vs.variable_scope(scope or type(self).__name__): + x_shape = x.get_shape().with_rank(2) + if not x_shape[1]: + raise ValueError("Expecting x_shape[1] to be sets: %s" % str(x_shape)) + if len(states_prev) != 2: + raise ValueError("Expecting states_prev to be a tuple with length 2.") + input_size = x_shape[1] + w = vs.get_variable("W", [input_size + self._num_units, + self._num_units * 4]) + b = vs.get_variable("b", [w.get_shape().with_rank(2)[1]], + initializer=init_ops.constant_initializer(0.0)) + wci = vs.get_variable("wci", [self._num_units]) + wco = vs.get_variable("wco", [self._num_units]) + wcf = vs.get_variable("wcf", [self._num_units]) + (cs_prev, h_prev) = states_prev + (_, cs, _, _, _, _, h) = _lstm_fused_cell(x, + cs_prev, + h_prev, + w, + b, + wci=wci, + wco=wco, + wcf=wcf, + forget_bias=self._forget_bias, + use_peephole=self._use_peephole) + + return (h, (cs, h)) diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py index 7d00e73f90a..0ea41e10102 100644 --- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py +++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py @@ -27,12 +27,6 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import rnn_cell from tensorflow.python.ops import variable_scope as vs -from tensorflow.python.ops.math_ops import reduce_sum -from tensorflow.python.ops.math_ops import sigmoid -from tensorflow.python.ops.math_ops import tanh -from tensorflow.python.ops.nn_ops import conv2d -from tensorflow.python.ops.nn_ops import softmax - from tensorflow.python.platform import tf_logging as logging 
from tensorflow.python.util import nest @@ -104,7 +98,7 @@ class CoupledInputForgetGateLSTMCell(rnn_cell.RNNCell): initializer=None, num_proj=None, proj_clip=None, num_unit_shards=1, num_proj_shards=1, forget_bias=1.0, state_is_tuple=False, - activation=tanh): + activation=math_ops.tanh): """Initialize the parameters for an LSTM cell. Args: @@ -188,6 +182,8 @@ class CoupledInputForgetGateLSTMCell(rnn_cell.RNNCell): ValueError: If input size cannot be inferred from inputs via static shape inference. """ + sigmoid = math_ops.sigmoid + num_proj = self._num_units if self._num_proj is None else self._num_proj if self._state_is_tuple: @@ -322,6 +318,8 @@ class TimeFreqLSTMCell(rnn_cell.RNNCell): ValueError: if an input_size was specified and the provided inputs have a different dimension. """ + sigmoid = math_ops.sigmoid + tanh = math_ops.tanh freq_inputs = self._make_tf_features(inputs) dtype = inputs.dtype @@ -489,6 +487,8 @@ class GridLSTMCell(rnn_cell.RNNCell): ValueError: if an input_size was specified and the provided inputs have a different dimension. 
""" + sigmoid = math_ops.sigmoid + tanh = math_ops.tanh freq_inputs = self._make_tf_features(inputs) dtype = inputs.dtype @@ -771,6 +771,11 @@ class AttentionCellWrapper(rnn_cell.RNNCell): return output, new_state def _attention(self, query, attn_states): + conv2d = nn_ops.conv2d + reduce_sum = math_ops.reduce_sum + softmax = nn_ops.softmax + tanh = math_ops.tanh + with vs.variable_scope("Attention"): k = vs.get_variable("AttnW", [1, 1, self._attn_size, self._attn_vec_size]) v = vs.get_variable("AttnV", [self._attn_vec_size]) diff --git a/tensorflow/contrib/session_bundle/session_bundle.py b/tensorflow/contrib/session_bundle/session_bundle.py index 1479db57122..6f895cb2515 100644 --- a/tensorflow/contrib/session_bundle/session_bundle.py +++ b/tensorflow/contrib/session_bundle/session_bundle.py @@ -30,10 +30,9 @@ from tensorflow.contrib.session_bundle import constants from tensorflow.contrib.session_bundle import manifest_pb2 from tensorflow.core.protobuf import meta_graph_pb2 from tensorflow.python.lib.io import file_io -from tensorflow.python.platform import gfile -def LoadSessionBundleFromPath(export_dir, target="", config=None): +def load_session_bundle_from_path(export_dir, target="", config=None): """Load session bundle from the given path. 
The function reads input from the export_dir, constructs the graph data to the @@ -55,15 +54,15 @@ def LoadSessionBundleFromPath(export_dir, target="", config=None): """ meta_graph_filename = os.path.join(export_dir, constants.META_GRAPH_DEF_FILENAME) - if not gfile.Exists(meta_graph_filename): + if not file_io.file_exists(meta_graph_filename): raise RuntimeError("Expected meta graph file missing %s" % meta_graph_filename) variables_filename = os.path.join(export_dir, constants.VARIABLES_FILENAME) - if not gfile.Exists(variables_filename): + if not file_io.file_exists(variables_filename): variables_filename = os.path.join( export_dir, constants.VARIABLES_FILENAME_PATTERN) - if not gfile.Glob(variables_filename): + if not file_io.get_matching_files(variables_filename): raise RuntimeError("Expected variables file missing %s" % variables_filename) assets_dir = os.path.join(export_dir, constants.ASSETS_DIRECTORY) diff --git a/tensorflow/contrib/session_bundle/session_bundle_test.py b/tensorflow/contrib/session_bundle/session_bundle_test.py index a9e157eb196..a080e16d1b4 100644 --- a/tensorflow/contrib/session_bundle/session_bundle_test.py +++ b/tensorflow/contrib/session_bundle/session_bundle_test.py @@ -33,7 +33,7 @@ class SessionBundleLoadTest(tf.test.TestCase): base_path = tf.test.test_src_dir_path( "contrib/session_bundle/example/half_plus_two/00000123") tf.reset_default_graph() - sess, meta_graph_def = session_bundle.LoadSessionBundleFromPath( + sess, meta_graph_def = session_bundle.load_session_bundle_from_path( base_path, target="", config=tf.ConfigProto(device_count={"CPU": 2})) self.assertTrue(sess) @@ -66,7 +66,7 @@ class SessionBundleLoadTest(tf.test.TestCase): base_path = tf.test.test_src_dir_path("/no/such/a/dir") tf.reset_default_graph() with self.assertRaises(RuntimeError) as cm: - _, _ = session_bundle.LoadSessionBundleFromPath( + _, _ = session_bundle.load_session_bundle_from_path( base_path, target="local", config=tf.ConfigProto(device_count={"CPU": 
2})) self.assertTrue("Expected meta graph file missing" in str(cm.exception)) diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md index 85bf786a66a..79b1a203ca3 100644 --- a/tensorflow/contrib/slim/README.md +++ b/tensorflow/contrib/slim/README.md @@ -198,7 +198,7 @@ of the corresponding TF-Slim code: ```python input = ... -net = slim.conv2d(input, [3, 3], 128, scope='conv1_1') +net = slim.conv2d(input, 128, [3, 3], scope='conv1_1') ``` TF-Slim provides standard implementations for numerous components for building @@ -431,7 +431,7 @@ between the predicted and true values. Certain models, such as multi-task learning models, require the use of multiple loss functions simultaneously. In -other words, the loss function ultimatey being minimized is the sum of various +other words, the loss function ultimately being minimized is the sum of various other loss functions. For example, consider a model that predicts both the type of scene in an image as well as the depth from the camera of each pixel. This model's loss function would be the sum of the diff --git a/tensorflow/contrib/slim/python/slim/data/README.md b/tensorflow/contrib/slim/python/slim/data/README.md new file mode 100644 index 00000000000..858c6949902 --- /dev/null +++ b/tensorflow/contrib/slim/python/slim/data/README.md @@ -0,0 +1,153 @@ +# TensorFlow-Slim Data + +TF-Slim provides a data loading library for facilitating the reading of data +from various formats. TF-Slim's data modules are composed of several layers of +abstraction to make it flexible enough to support multiple file storage types, +such as TFRecords or Text files, data encoding and features naming schemes. + +# Overview + +The task of loading data has two main components: (1) specification of how +a dataset is represented so it can be read and interpreted and (2) instruction +for providing the data to consumers of the dataset. 
+ +Secondly, one must specify instructions for how +the data is actually provided and housed in memory. For example, if the data is +sharded over many sources, should it be read in parallel from these sources? +Should it be read serially? Should the data be shuffled in memory? + +# Dataset Specification + +TF-Slim defines a dataset to be a set of files (that may or may not be encoded) +representing a finite set of samples, and which can be read to provide a +predefined set of entities or `items`. For example, a dataset might be stored +over thousands of files or a single file. The files might store the data in +clear text or some advanced encoding scheme. It might provide a single `item`, +like an image, or several `items`, like an image, a class label and a scene +label. + +More concretely, TF-Slim's +[dataset](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/dataset.py) +is a tuple that encapsulates the following elements of a dataset specification: + +* `data_sources`: A list of file paths that together make up the dataset +* `reader`: A TensorFlow +[Reader](https://www.tensorflow.org/api_docs/python/io_ops.html#ReaderBase) +appropriate for the file type in `data_sources`. +* `decoder`: A TF-Slim +[data_decoder](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/data_decoder.py) +class which is used to decode the content of the read dataset files. +* `num_samples`: The number of samples in the dataset. +* `items_to_descriptions`: A map from the items provided by the dataset to +descriptions of each. + +In a nutshell, a dataset is read by (a) opening the files specified by +`data_sources` using the given `reader` class (b) decoding the files using +the given `decoder` and (c) allowing the user to request a list of `items` to +be returned as `Tensors`. 
+ +## Data Decoders + +A +[data_decoder](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/data_decoder.py) +is a class which is given some (possibly serialized/encoded) data and returns a +list of `Tensors`. In particular, a given data decoder is able to decode a +predefined list of `items` and can return a subset or all of them, when +requested: + +```python +# Load the data +my_encoded_data = ... +data_decoder = MyDataDecoder() + +# Decode the inputs and labels: +decoded_input, decoded_labels = data_decoder.Decode(data, ['input', 'labels']) + +# Decode just the inputs: +decoded_input = data_decoder.Decode(data, ['input']) + +# Check which items a data decoder knows how to decode: +for item in data_decoder.list_items(): + print(item) +``` + +## Example: TFExampleDataDecoder + +The +[tfexample_data_decoder.py](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/tfexample_data_decoder.py) +is a data decoder which decodes serialized `TFExample` protocol buffers. A +`TFExample` protocol buffer is a map from keys (strings) to either a +`tf.FixedLenFeature` or `tf.VarLenFeature`. Consequently, to decode a +`TFExample`, one must provide a mapping from one or more `TFExample` fields +to each of the `items` that the `tfexample_data_decoder` can provide. For +example, a dataset of `TFExamples` might store images in various formats and +each `TFExample` might contain an `encoding` key and a `format` key which can +be used to decode the image using the appropriate decoder (jpg, png, etc). + +To make this possible, the `tfexample_data_decoder` is constructed by specifying +a map of `TFExample` keys to either `tf.FixedLenFeature` or +`tf.VarLenFeature` as well as a set of `ItemHandlers`. An `ItemHandler` +provides a mapping from `TFExample` keys to the item being provided. Because a +`tfexample_data_decoder` might return multiple `items`, one often constructs a +`tfexample_data_decoder` using multiple `ItemHandlers`. 
+ +`tfexample_data_decoder` provides some predefined `ItemHandlers` which take care +of the common cases of mapping `TFExamples` to images, `Tensors` and +`SparseTensors`. For example, the following specification might be +used to decode a dataset of images: + +```python +keys_to_features = { + 'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''), + 'image/format': tf.FixedLenFeature((), tf.string, default_value='raw'), + 'image/class/label': tf.FixedLenFeature( + [1], tf.int64, default_value=tf.zeros([1], dtype=tf.int64)), +} + +items_to_handlers = { + 'image': tfexample_decoder.Image( + image_key = 'image/encoded', + format_key = 'image/format', + shape=[28, 28], + channels=1), + 'label': tfexample_decoder.Tensor('image/class/label'), +} + +decoder = tfexample_decoder.TFExampleDecoder( + keys_to_features, items_to_handlers) +``` + +Notice that the TFExample is parsed using three keys: `image/encoded`, +`image/format` and `image/class/label`. Additionally, the first two keys are +mapped to a single `item` named 'image'. As defined, this `data_decoder` +provides two `items` named 'image' and 'label'. + +# Data Provision + +A +[data_provider](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/data_provider.py) +is a class which provides `Tensors` for each item requested: + +```python +my_data_provider = ... +image, class_label, bounding_box = my_data_provider.get( + ['image', 'label', 'bb']) +``` + +The +[dataset_data_provider](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py) +is a `data_provider` that provides data from a given `dataset` specification: + +```python +dataset = GetDataset(...) +data_provider = dataset_data_provider.DatasetDataProvider( + dataset, common_queue_capacity=32, common_queue_min=8) +``` + +The `dataset_data_provider` enables control over several elements of data +provision: + +* How many concurrent readers are used. 
+* Whether the data is shuffled as it's loaded into its queue +* Whether to take a single pass over the data or read data indefinitely. + diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py index cd052576044..d768722cd8a 100644 --- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py +++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py @@ -1,4 +1,4 @@ -# Copyright 2016 Google Inc. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -97,49 +97,57 @@ class ItemHandlerCallback(ItemHandler): class Tensor(ItemHandler): - """An ItemHandler that returns a parsed Tensor or SparseTensor.""" + """An ItemHandler that returns a parsed Tensor.""" - def __init__(self, tensor_key, shape_key=None, shape=None, default_value=0): + def __init__(self, tensor_key, shape_keys=None, shape=None, default_value=0): """Initializes the Tensor handler. Tensors are, by default, returned without any reshaping. However, there are - two mechanisms which allow reshaping to occur at load time. If `shape_key` - is provided, both the `Tensor` corresponding to `tensor_key` and `shape_key` - is loaded and the former `Tensor` is reshaped with the values of the latter. - Alternatively, if a fixed `shape` is provided, the `Tensor` corresponding to - `tensor_key` is loaded and reshape appropriately. If neither `shape_key` nor - `shape` are provided, the `Tensor` will be returned without any reshaping. + two mechanisms which allow reshaping to occur at load time. If `shape_keys` + is provided, both the `Tensor` corresponding to `tensor_key` and + `shape_keys` is loaded and the former `Tensor` is reshaped with the values + of the latter. 
Alternatively, if a fixed `shape` is provided, the `Tensor` + corresponding to `tensor_key` is loaded and reshape appropriately. + If neither `shape_keys` nor `shape` are provided, the `Tensor` will be + returned without any reshaping. Args: tensor_key: the name of the `TFExample` feature to read the tensor from. - shape_key: Optional name of the TF-Example feature in which the tensor - shape is stored. - shape: Optional output shape of the Tensor. If provided, the `Tensor` is + shape_keys: Optional name or list of names of the TF-Example feature in + which the tensor shape is stored. If a list, then each corresponds to + one dimension of the shape. + shape: Optional output shape of the `Tensor`. If provided, the `Tensor` is reshaped accordingly. - default_value: Scalar value to set when making dense for indices not - specified in the `SparseTensor`. + default_value: The value used when the `tensor_key` is not found in a + particular `TFExample`. Raises: - ValueError: if both `shape_key` and `shape` are specified. + ValueError: if both `shape_keys` and `shape` are specified. 
""" - if shape_key and shape is not None: - raise ValueError('Cannot specify both shape_key and shape parameters.') + if shape_keys and shape is not None: + raise ValueError('Cannot specify both shape_keys and shape parameters.') + if shape_keys and not isinstance(shape_keys, list): + shape_keys = [shape_keys] self._tensor_key = tensor_key - self._shape_key = shape_key + self._shape_keys = shape_keys self._shape = shape self._default_value = default_value keys = [tensor_key] - if shape_key: - keys.append(shape_key) + if shape_keys: + keys.extend(shape_keys) super(Tensor, self).__init__(keys) def tensors_to_item(self, keys_to_tensors): tensor = keys_to_tensors[self._tensor_key] shape = self._shape - if self._shape_key: - shape = keys_to_tensors[self._shape_key] - if isinstance(shape, ops.SparseTensor): - shape = sparse_ops.sparse_tensor_to_dense(shape) + if self._shape_keys: + shape_dims = [] + for k in self._shape_keys: + shape_dim = keys_to_tensors[k] + if isinstance(shape_dim, ops.SparseTensor): + shape_dim = sparse_ops.sparse_tensor_to_dense(shape_dim) + shape_dims.append(shape_dim) + shape = array_ops.squeeze(array_ops.pack(shape_dims)) if isinstance(tensor, ops.SparseTensor): if shape is not None: tensor = sparse_ops.sparse_reshape(tensor, shape) diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py index 7fd5ac6646a..7f0dd30ed9f 100644 --- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py +++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py @@ -315,9 +315,50 @@ class TFExampleDecoderTest(tf.test.TestCase): } items_to_handlers = { 'image': slim.tfexample_decoder.Tensor('image', - shape_key='image/shape'), + shape_keys='image/shape'), 'labels': slim.tfexample_decoder.Tensor('labels', - shape_key='labels/shape'), + shape_keys='labels/shape'), + } + decoder = slim.tfexample_decoder.TFExampleDecoder( + keys_to_features, 
items_to_handlers) + [tf_image, tf_labels] = decoder.decode(serialized_example, + ['image', 'labels']) + self.assertAllEqual(tf_image.eval(), np_image) + self.assertAllEqual(tf_labels.eval(), np_labels) + + def testDecodeExampleMultiShapeKeyTensor(self): + np_image = np.random.rand(2, 3, 1).astype('f') + np_labels = np.array([[[1], [2], [3]], + [[4], [5], [6]]]) + height, width, depth = np_labels.shape + + example = tf.train.Example(features=tf.train.Features(feature={ + 'image': self._EncodedFloatFeature(np_image), + 'image/shape': self._EncodedInt64Feature(np.array(np_image.shape)), + 'labels': self._EncodedInt64Feature(np_labels), + 'labels/height': self._EncodedInt64Feature(np.array([height])), + 'labels/width': self._EncodedInt64Feature(np.array([width])), + 'labels/depth': self._EncodedInt64Feature(np.array([depth])), + })) + + serialized_example = example.SerializeToString() + + with self.test_session(): + serialized_example = tf.reshape(serialized_example, shape=[]) + keys_to_features = { + 'image': tf.VarLenFeature(dtype=tf.float32), + 'image/shape': tf.VarLenFeature(dtype=tf.int64), + 'labels': tf.VarLenFeature(dtype=tf.int64), + 'labels/height': tf.VarLenFeature(dtype=tf.int64), + 'labels/width': tf.VarLenFeature(dtype=tf.int64), + 'labels/depth': tf.VarLenFeature(dtype=tf.int64), + } + items_to_handlers = { + 'image': slim.tfexample_decoder.Tensor( + 'image', shape_keys='image/shape'), + 'labels': slim.tfexample_decoder.Tensor( + 'labels', + shape_keys=['labels/height', 'labels/width', 'labels/depth']), } decoder = slim.tfexample_decoder.TFExampleDecoder( keys_to_features, items_to_handlers) diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py index e6314a9ce9c..433e4ae61f0 100644 --- a/tensorflow/contrib/slim/python/slim/evaluation.py +++ b/tensorflow/contrib/slim/python/slim/evaluation.py @@ -253,7 +253,8 @@ def evaluation_loop(master, summary_op_feed_dict=None, variables_to_restore=None, 
eval_interval_secs=60, - max_number_of_evaluations=None): + max_number_of_evaluations=None, + session_config=None): """Runs TF-Slim's Evaluation Loop. Args: @@ -276,6 +277,8 @@ def evaluation_loop(master, eval_interval_secs: The minimum number of seconds between evaluations. max_number_of_evaluations: the max number of iterations of the evaluation. If the value is left as 'None', the evaluation continues indefinitely. + session_config: An instance of `tf.ConfigProto` that will be used to + configure the `Session`. If left as `None`, the default will be used. """ if summary_op == _USE_DEFAULT: summary_op = logging_ops.merge_all_summaries() @@ -307,7 +310,8 @@ def evaluation_loop(master, logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) - with sv.managed_session(master, start_standard_services=False) as sess: + with sv.managed_session( + master, start_standard_services=False, config=session_config) as sess: sv.saver.restore(sess, last_checkpoint) sv.start_queue_runners(sess) evaluation(sess, diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py index ccb26bba47b..c6312e4a001 100644 --- a/tensorflow/contrib/slim/python/slim/learning.py +++ b/tensorflow/contrib/slim/python/slim/learning.py @@ -538,12 +538,14 @@ def train( init_feed_dict=None, local_init_op=None, init_fn=None, + ready_op=_USE_DEFAULT, summary_op=_USE_DEFAULT, save_summaries_secs=600, startup_delay_steps=0, saver=None, save_interval_secs=600, - sync_optimizer=None): + sync_optimizer=None, + session_config=None): """Runs a training loop using a TensorFlow supervisor. When the sync_optimizer is supplied, gradient updates are applied @@ -579,6 +581,9 @@ def train( `tf.initialize_local_variables()` and `tf.initialize_all_tables()`. init_fn: An optional callable to be executed after `init_op` is called. The callable must accept one argument, the session being initialized. 
+ ready_op: Operation to check if the model is ready to use. If left to its + default value, then the session checks for readiness by calling + `tf.report_uninitialized_variables()`. summary_op: The summary operation. save_summaries_secs: How often, in seconds, to save summaries. startup_delay_steps: The number of steps to wait for before beginning. Note @@ -589,6 +594,8 @@ def train( sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the argument is supplied, gradient updates will be synchronous. If left as `None`, gradient updates will be asynchronous. + session_config: An instance of `tf.ConfigProto` that will be used to + configure the `Session`. If left as `None`, the default will be used. Returns: the value of the loss function after training. @@ -624,6 +631,9 @@ def train( if init_op == _USE_DEFAULT: init_op = tf_variables.initialize_all_variables() + if ready_op == _USE_DEFAULT: + ready_op = tf_variables.report_uninitialized_variables() + if summary_op == _USE_DEFAULT: summary_op = logging_ops.merge_all_summaries() @@ -660,6 +670,7 @@ def train( init_op=init_op, init_feed_dict=init_feed_dict, local_init_op=local_init_op, + ready_op=ready_op, summary_op=summary_op, global_step=global_step, saver=saver, @@ -671,7 +682,8 @@ def train( while should_retry: try: should_retry = False - with sv.managed_session(master, start_standard_services=False) as sess: + with sv.managed_session( + master, start_standard_services=False, config=session_config) as sess: logging.info('Starting Session.') if is_chief: if logdir: @@ -694,10 +706,11 @@ def train( if logdir and sv.is_chief: logging.info('Finished training! 
Saving model to disk.') sv.saver.save(sess, sv.save_path, global_step=sv.global_step) - finally: + except: if sv.is_chief and cleanup_op is not None: logging.info('About to execute sync_clean_up_op!') sess.run(cleanup_op) + raise except errors.AbortedError: # Always re-run on AbortedError as it indicates a restart of one of the diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py index b57c8f8fe63..4b7e42ceb24 100644 --- a/tensorflow/contrib/slim/python/slim/learning_test.py +++ b/tensorflow/contrib/slim/python/slim/learning_test.py @@ -362,6 +362,32 @@ class TrainTest(tf.test.TestCase): self.assertIsNotNone(loss) self.assertLess(loss, .015) + def testTrainWithSessionConfig(self): + g = tf.Graph() + with g.as_default(): + tf.set_random_seed(0) + tf_inputs = tf.constant(self._inputs, dtype=tf.float32) + tf_labels = tf.constant(self._labels, dtype=tf.float32) + + tf_predictions = LogisticClassifier(tf_inputs) + slim.losses.log_loss(tf_predictions, tf_labels) + total_loss = slim.losses.get_total_loss() + + optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0) + + train_op = slim.learning.create_train_op(total_loss, optimizer) + + session_config = tf.ConfigProto(allow_soft_placement=True) + loss = slim.learning.train( + train_op, + None, + number_of_steps=300, + log_every_n_steps=10, + graph=g, + session_config=session_config) + self.assertIsNotNone(loss) + self.assertLess(loss, .015) + def testTrainWithNoneAsLogdirWhenUsingSummariesRaisesError(self): with tf.Graph().as_default(): tf.set_random_seed(0) diff --git a/tensorflow/contrib/tensor_forest/data/data_ops.py b/tensorflow/contrib/tensor_forest/data/data_ops.py index 28422f30ba3..e0e6be09e33 100644 --- a/tensorflow/contrib/tensor_forest/data/data_ops.py +++ b/tensorflow/contrib/tensor_forest/data/data_ops.py @@ -131,7 +131,7 @@ def _ParseSparse(data): offset += offset_increment return (sparse_ops.sparse_concat(1, sparse_tensors), - 
[constants.DATA_ALL_CATEGORICAL]) + [constants.DATA_FLOAT]) def _ParseDense(data): diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 09203794032..e0a396fb9a5 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -135,6 +135,7 @@ cc_library( "lib/core/coding.h", "lib/core/errors.h", "lib/core/notification.h", + "lib/core/raw_coding.h", "lib/core/status.h", "lib/core/stringpiece.h", "lib/core/threadpool.h", @@ -155,6 +156,8 @@ cc_library( "lib/io/table_options.h", "lib/jpeg/jpeg_mem.h", "lib/monitoring/counter.h", + "lib/monitoring/export_registry.h", + "lib/monitoring/metric_def.h", "lib/random/distribution_sampler.h", "lib/random/philox_random.h", "lib/random/simple_philox.h", # TODO(josh11b): make internal @@ -170,6 +173,7 @@ cc_library( "platform/logging.h", "platform/macros.h", "platform/mem.h", + "platform/net.h", "platform/mutex.h", "platform/protobuf.h", # TODO(josh11b): make internal "platform/regexp.h", @@ -422,9 +426,6 @@ tf_cuda_library( "graph/validate.h", "public/session.h", "public/session_options.h", - "public/tensor_c_api.h", - "util/checkpoint_reader.h", - "util/tf_status_helper.h", ], visibility = ["//visibility:public"], deps = [ @@ -571,6 +572,7 @@ filegroup( name = "android_srcs", srcs = [ ":proto_text_srcs_all", + "//tensorflow/core/debug:android_srcs", "//tensorflow/core/kernels:android_srcs", "//tensorflow/core/platform/default/build_config:android_srcs", "//tensorflow/core/util/ctc:android_srcs", @@ -581,8 +583,6 @@ filegroup( "client/**/*.cc", "common_runtime/**/*.h", "common_runtime/**/*.cc", - "debug/**/*.h", - "debug/**/*.cc", "framework/**/*.h", "framework/**/*.cc", "graph/**/*.h", @@ -604,10 +604,8 @@ filegroup( "lib/jpeg/**/*", "lib/png/**/*", "lib/gif/**/*", - "util/checkpoint_reader.*", "util/events_writer.*", "util/reporter.*", - "util/tf_status_helper.*", "platform/default/stream_executor.*", "platform/default/test_benchmark.*", "platform/cuda.h", @@ -916,8 +914,6 @@ tf_cuda_library( "**/*test*", 
"**/*main.cc", "example/example_parser_configuration.*", - "util/tf_status_helper.*", - "util/checkpoint_reader.*", "util/reporter.h", "util/reporter.cc", "framework/fake_input.*", @@ -1015,6 +1011,7 @@ filegroup( "platform/default/protobuf.h", "platform/default/thread_annotations.h", "platform/env.h", + "platform/file_statistics.h", "platform/file_system.h", "platform/fingerprint.h", "platform/host_info.h", @@ -1022,6 +1019,7 @@ filegroup( "platform/macros.h", "platform/mem.h", "platform/mutex.h", + "platform/net.h", "platform/platform.h", "platform/protobuf.h", "platform/strong_hash.h", @@ -1062,10 +1060,7 @@ tf_cuda_library( "graph/**/*.cc", "public/session.h", "public/session_options.h", - "public/tensor_c_api.h", "public/version.h", - "util/tf_status_helper.*", - "util/checkpoint_reader.*", ], exclude = [ "**/*test*", @@ -1111,49 +1106,13 @@ tf_cuda_library( linkstatic = 1, deps = [ ":core_cpu_internal", - ":debug_graph_utils", ":framework", ":gpu_tracer", ":lib", ":lib_internal", ":proto_text", ":protos_all_cc", - ], - alwayslink = 1, -) - -tf_cuda_library( - name = "debug_gateway_internal", - srcs = ["debug/debug_gateway.cc"], - hdrs = ["debug/debug_gateway.h"], - copts = tf_copts(), - linkstatic = 1, - deps = [ - ":core_cpu_internal", - ":direct_session_internal", - ":framework", - ":gpu_tracer", - ":lib", - ":lib_internal", - ":proto_text", - ":protos_all_cc", - ], - alwayslink = 1, -) - -tf_cuda_library( - name = "debug_graph_utils", - srcs = ["debug/debug_graph_utils.cc"], - hdrs = ["debug/debug_graph_utils.h"], - copts = tf_copts(), - linkstatic = 1, - deps = [ - ":core_cpu_internal", - ":framework", - ":lib", - ":lib_internal", - ":proto_text", - ":protos_all_cc", + "//tensorflow/core/debug:debug_graph_utils", ], alwayslink = 1, ) @@ -1325,6 +1284,8 @@ tf_cc_tests( "lib/io/table_test.cc", "lib/io/zlib_buffers_test.cc", "lib/monitoring/counter_test.cc", + "lib/monitoring/export_registry_test.cc", + "lib/monitoring/metric_def_test.cc", 
"lib/random/distribution_sampler_test.cc", "lib/random/philox_random_test.cc", "lib/random/random_distributions_test.cc", @@ -1336,9 +1297,10 @@ tf_cc_tests( "lib/strings/strcat_test.cc", "lib/strings/stringprintf_test.cc", "lib/wav/wav_io_test.cc", - "platform/env_test.cc", + "platform/fingerprint_test.cc", "platform/integral_types_test.cc", "platform/logging_test.cc", + "platform/net_test.cc", "platform/port_test.cc", ], deps = [ @@ -1352,6 +1314,21 @@ tf_cc_tests( ], ) +tf_cc_tests( + size = "small", + tags = ["manual"], # http://b/30439755 + tests = ["platform/env_test.cc"], + deps = [ + ":lib", + ":lib_internal", + ":lib_test_internal", + ":protos_all_cc", + ":test", + ":test_main", + "//third_party/eigen3", + ], +) + cc_test( name = "lib_jpeg_jpeg_mem_unittest", srcs = ["lib/jpeg/jpeg_mem_unittest.cc"], @@ -1481,36 +1458,6 @@ tf_cc_tests( ], ) -tf_cc_tests( - size = "small", - linkopts = select({ - "//tensorflow:darwin": ["-headerpad_max_install_names"], - "//conditions:default": [], - }), - linkstatic = tf_kernel_tests_linkstatic(), - tests = [ - "client/tensor_c_api_test.cc", - ], - deps = [ - ":core", - ":core_cpu", - ":core_cpu_internal", - ":direct_session_internal", - ":framework", - ":framework_internal", - ":lib", - ":lib_internal", - ":proto_text", - ":protos_all_cc", - ":test", - ":test_main", - ":testlib", - "//tensorflow/core/kernels:array", - "//tensorflow/core/kernels:math", - "//third_party/eigen3", - ], -) - # GPU-related tests tf_cc_tests_gpu( size = "small", @@ -1627,35 +1574,6 @@ tf_cc_test( ], ) -tf_cc_test_gpu( - name = "debug/debug_gateway_test", - size = "small", - args = ["--heap_check=local"], - linkstatic = tf_kernel_tests_linkstatic(), - tags = tf_cuda_tests_tags() + ["nomac"], - deps = [ - ":all_kernels", - ":core_cpu", - ":core_cpu_internal", - ":debug_gateway_internal", - ":debug_graph_utils", - ":direct_session", - ":direct_session_internal", - ":framework", - ":framework_internal", - ":gpu_runtime", - ":lib", - ":lib_internal", 
- ":protos_all_cc", - ":test", - ":test_main", - ":testlib", - "//tensorflow/cc:cc_ops", - "//tensorflow/core/kernels:debug_ops", - "//tensorflow/core/kernels:ops_util", - ], -) - tf_cc_test( name = "common_runtime/direct_session_with_tracking_alloc_test", size = "small", @@ -1867,6 +1785,7 @@ tf_cc_tests( tests = [ "ops/array_ops_test.cc", "ops/candidate_sampling_ops_test.cc", + "ops/control_flow_ops_test.cc", "ops/ctc_ops_test.cc", "ops/data_flow_ops_test.cc", "ops/functional_ops_test.cc", @@ -1876,7 +1795,9 @@ tf_cc_tests( "ops/math_ops_test.cc", "ops/nn_ops_test.cc", "ops/parsing_ops_test.cc", + "ops/random_ops_test.cc", "ops/sparse_ops_test.cc", + "ops/state_ops_test.cc", "ops/string_ops_test.cc", "ops/training_ops_test.cc", ], @@ -1983,6 +1904,7 @@ filegroup( "example/testdata/parse_example_graph_def.pbtxt", ], ) + # ----------------------------------------------------------------------------- # Google-internal targets go here (must be at the end). diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc index 03b93cf9a98..234069a6268 100644 --- a/tensorflow/core/common_runtime/constant_folding.cc +++ b/tensorflow/core/common_runtime/constant_folding.cc @@ -39,7 +39,8 @@ namespace tensorflow { namespace { -bool IsConstantFoldable(const Node* n, +bool IsConstantFoldable(const FunctionLibraryDefinition* flib_def, + const Node* n, std::function consider) { if (n->op_def().is_stateful()) { return false; @@ -61,18 +62,28 @@ bool IsConstantFoldable(const Node* n, if (n->IsSink()) { return false; } + // For now, don't try to constant-fold functions. (They may be inlined, in + // which case they will become subject to constant-folding again.) + // TODO(phawkins): support constant-folding for functions; functions may + // be arbitrarily expensive to execute. 
+ if (flib_def && flib_def->Find(n->type_string())) { + return false; + } return true; } // Returns the constant foldable nodes in `nodes_result` in data flow order. -void FindConstantFoldableNodes(const Graph* graph, ConstantFoldingOptions opts, +void FindConstantFoldableNodes(const Graph* graph, + const FunctionLibraryDefinition* flib_def, + ConstantFoldingOptions opts, std::vector* nodes_result) { std::set node_set; std::vector& nodes = *nodes_result; bool internal_node_inserted = false; // Walk the nodes in data flow order ReverseDFS(*graph, nullptr, - [&nodes, &node_set, &internal_node_inserted, opts](Node* n) { + [&nodes, &node_set, &internal_node_inserted, opts, + flib_def](Node* n) { if (n->IsConstant()) { // Constants with no control inputs (except from _SOURCE node) // are definitely constant foldable. @@ -82,7 +93,7 @@ void FindConstantFoldableNodes(const Graph* graph, ConstantFoldingOptions opts, node_set.insert(n); nodes.push_back(n); } - } else if (IsConstantFoldable(n, opts.consider)) { + } else if (IsConstantFoldable(flib_def, n, opts.consider)) { // Check whether the set of this node's in_nodes is completely // included in the set of constant foldable nodes. If true, // then this node is also constant foldable. 
@@ -303,6 +314,7 @@ bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device, } bool DoConstantFolding(const ConstantFoldingOptions& opts, + FunctionLibraryRuntime* function_library, Device* partition_device, Graph* graph) { DumpGraph("Before", graph); Device* device = GetCPUDevice(); @@ -313,8 +325,12 @@ bool DoConstantFolding(const ConstantFoldingOptions& opts, return false; } + const FunctionLibraryDefinition* flib_def = nullptr; + if (function_library) { + flib_def = function_library->GetFunctionLibraryDefinition(); + } std::vector constant_foldable_nodes; - FindConstantFoldableNodes(graph, opts, &constant_foldable_nodes); + FindConstantFoldableNodes(graph, flib_def, opts, &constant_foldable_nodes); if (constant_foldable_nodes.empty()) { VLOG(1) << "No constant foldable nodes found"; return false; diff --git a/tensorflow/core/common_runtime/constant_folding.h b/tensorflow/core/common_runtime/constant_folding.h index e0bc868bc63..f354aedc592 100644 --- a/tensorflow/core/common_runtime/constant_folding.h +++ b/tensorflow/core/common_runtime/constant_folding.h @@ -31,6 +31,7 @@ namespace tensorflow { // assumed to execute. // Returns true if and only if "graph" has been mutated. bool DoConstantFolding(const ConstantFoldingOptions& opts, + FunctionLibraryRuntime* function_library, Device* partition_device, Graph* graph); typedef std::pair NodeAndOutput; diff --git a/tensorflow/core/common_runtime/constant_folding_test.cc b/tensorflow/core/common_runtime/constant_folding_test.cc index 704e9fb2fb0..946b939e9a1 100644 --- a/tensorflow/core/common_runtime/constant_folding_test.cc +++ b/tensorflow/core/common_runtime/constant_folding_test.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/function_testlib.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -108,7 +109,7 @@ class ConstantFoldingTest : public ::testing::Test { TEST_F(ConstantFoldingTest, Basic) { SIMPLE_GRAPH; - EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, g)); + EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, nullptr, g)); // Nodes s1 and s2 now should now have a constant input EXPECT_EQ(1, s1->num_inputs()); @@ -124,7 +125,7 @@ TEST_F(ConstantFoldingTest, ConsiderFunction) { ConstantFoldingOptions opts; // Do not allow constant folding of m2 opts.consider = [m2](const Node* n) { return m2 != n; }; - EXPECT_TRUE(DoConstantFolding(opts, nullptr, g)); + EXPECT_TRUE(DoConstantFolding(opts, nullptr, nullptr, g)); // Node s1 now should now have a constant input EXPECT_EQ(1, s1->num_inputs()); @@ -141,7 +142,7 @@ TEST_F(ConstantFoldingTest, TestNoReplaceAnotherConstant) { g->AddControlEdge(g->source_node(), d); Node* s3 = test::graph::Send(g, d, "d", "sender", 0, "receiver"); g->AddControlEdge(s3, g->sink_node()); - EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, g)); + EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, nullptr, g)); // Nodes s3 should still have d as input EXPECT_EQ(1, s3->num_inputs()); @@ -167,7 +168,7 @@ TEST_F(ConstantFoldingTest, TwoOutputs) { g->AddControlEdge(b0, g->sink_node()); g->AddControlEdge(b1, g->sink_node()); - EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, g)); + EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, nullptr, g)); EXPECT_EQ(1, b0->num_inputs()); ExpectNodeEqual(*(b0->in_nodes().begin()), {0, 1}, {2}); EXPECT_EQ(1, b1->num_inputs()); @@ -193,7 +194,7 @@ TEST_F(ConstantFoldingTest, 
TwoOutputsFoldOneOutput) { ConstantFoldingOptions opts; opts.consider = [b1_ident](const Node* n) { return b1_ident != n; }; - EXPECT_TRUE(DoConstantFolding(opts, nullptr, g)); + EXPECT_TRUE(DoConstantFolding(opts, nullptr, nullptr, g)); // 0th output of b should have been folded. EXPECT_EQ(1, b0->num_inputs()); ExpectNodeEqual(*(b0->in_nodes().begin()), {0, 1}, {2}); @@ -229,11 +230,11 @@ TEST_F(ConstantFoldingTest, TestNoReplaceOnGPU) { g->AddControlEdge(send, g->sink_node()); // No ops should be replaced, as there is no kernel for BFLOAT16 on GPU. - EXPECT_FALSE(DoConstantFolding(ConstantFoldingOptions{}, device, g)); + EXPECT_FALSE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, device, g)); // But constant folding should have replaced the cast op with a constant when // running on CPU. - EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, g)); + EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, nullptr, g)); for (auto d : devices) { delete d; @@ -258,7 +259,39 @@ TEST_F(ConstantFoldingTest, TestNoReplaceLargeConstant) { g->AddControlEdge(concat_send, g->sink_node()); // The above concat should not have been constant folded. 
- EXPECT_FALSE(DoConstantFolding(ConstantFoldingOptions{}, nullptr, g)); + EXPECT_FALSE( + DoConstantFolding(ConstantFoldingOptions{}, nullptr, nullptr, g)); +} + +TEST_F(ConstantFoldingTest, TestNoReplaceFunctionCall) { + FunctionDefLibrary fdef_lib; + *fdef_lib.add_function() = test::function::XTimesTwo(); + + FunctionLibraryDefinition flib_def(OpRegistry::Global(), fdef_lib); + g_.reset(new Graph(&flib_def)); + + Graph* g = g_.get(); + Node* s = + Constant(std::vector(5 * 1024 * 256, 0), {5 * 1024 * 256}); + g->AddControlEdge(g->source_node(), s); + + NodeDef def; + TF_ASSERT_OK(NodeDefBuilder("times_two", "XTimesTwo", g->op_registry()) + .Input(s->name(), 0, DT_INT32) + .Finalize(&def)); + Status status; + Node* times_two = g->AddNode(def, &status); + TF_ASSERT_OK(status); + + Node* times_two_send = test::graph::Send(g, times_two, "times_two_send", + "sender", 0, "receiver"); + g->AddControlEdge(times_two_send, g->sink_node()); + + // The above function call should not have been constant folded. + EXPECT_FALSE( + DoConstantFolding(ConstantFoldingOptions{}, nullptr, nullptr, g)); + + g_ = nullptr; } } // namespace diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 8621118bda0..9e20aee879a 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -730,12 +730,14 @@ Status DirectSession::GetOrCreateExecutors( options.fetch_endpoints = outputs_sorted; options.target_nodes = tn_sorted; + std::unique_ptr ek(new ExecutorsAndKeys); + // The executor_lock_ is intentionally released while executor is // being created. 
std::unordered_map> graphs; - TF_RETURN_IF_ERROR(CreateGraphs(options, &graphs, run_state_args)); + TF_RETURN_IF_ERROR( + CreateGraphs(options, &graphs, &ek->flib_def, run_state_args)); - std::unique_ptr ek(new ExecutorsAndKeys); if (run_state_args->is_partial_run) { ek->graph = std::move(run_state_args->graph); std::unordered_set names; @@ -769,7 +771,7 @@ Status DirectSession::GetOrCreateExecutors( auto* item = &(ek->items.back()); item->flib.reset( NewFunctionLibraryRuntime(device_mgr_.get(), device, graph_def_version, - flib_def_.get(), optimizer_opts)); + ek->flib_def.get(), optimizer_opts)); LocalExecutorParams params; params.device = device; @@ -848,6 +850,7 @@ Status DirectSession::GetOrCreateExecutors( Status DirectSession::CreateGraphs( const BuildGraphOptions& options, std::unordered_map>* outputs, + std::unique_ptr* flib_def, RunStateArgs* run_state_args) { mutex_lock l(graph_def_lock_); std::unique_ptr client_graph; @@ -964,7 +967,8 @@ Status DirectSession::CreateGraphs( if (!s.ok()) { break; } - std::unique_ptr device_graph(new Graph(flib_def_.get())); + std::unique_ptr device_graph( + new Graph(client_graph->flib_def.get())); GraphConstructorOptions device_opts; // There are internal operations (e.g., send/recv) that we now // allow. @@ -974,6 +978,7 @@ Status DirectSession::CreateGraphs( ConvertGraphDefToGraph(device_opts, *graph_def, device_graph.get())); outputs->emplace(partition_name, std::move(device_graph)); } + *flib_def = std::move(client_graph->flib_def); return s; } diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h index 21d5d9e5e2f..1b748954470 100644 --- a/tensorflow/core/common_runtime/direct_session.h +++ b/tensorflow/core/common_runtime/direct_session.h @@ -108,10 +108,15 @@ class DirectSession : public Session { // a partition of the graph bundled with its dependent library runtime. 
// 'input_keys' are the rendezvous keys for the feeds and 'output_keys' // are rendezvous keys for the fetches. + // 'flib_def' is the function library used by graphs in 'items'. + // TODO(phawkins): currently partitions always share the same function + // library. Consider giving each partition its own function library to enable + // per-partition rewrites. struct ExecutorsAndKeys { int64 step_count = 0; std::unique_ptr graph; NameNodeMap name_to_node; + std::unique_ptr flib_def; std::vector items; std::unordered_map input_keys; std::unordered_map output_keys; @@ -157,10 +162,12 @@ class DirectSession : public Session { ExecutorsAndKeys** executors_and_keys, RunStateArgs* run_state_args); // Creates several graphs given the existing graph_def_ and the - // input feeds and fetches, given 'devices'. + // input feeds and fetches, given 'devices'. The graphs share a common + // function library 'flib_def'. ::tensorflow::Status CreateGraphs( const BuildGraphOptions& options, std::unordered_map>* outputs, + std::unique_ptr* flib_def, RunStateArgs* run_state_args); ::tensorflow::Status ExtendLocked(const GraphDef& graph) @@ -237,6 +244,10 @@ class DirectSession : public Session { // Execution_state; used when placing the entire graph. std::unique_ptr execution_state_ GUARDED_BY(graph_def_lock_); + + // The function library, before any rewrites or optimizations have been + // performed. In particular, CreateGraphs() may need to modify the function + // library; it copies and modifies the function library. std::unique_ptr flib_def_; // For generating unique names. 
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 230d26b8a09..197ac462b4f 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -853,9 +853,6 @@ class ExecutorState { bool NodeDone(const Status& s, const Node* node, const TaggedNodeSeq& ready, NodeExecStats* stats, TaggedNodeReadyQueue* inline_ready); - // Call Process() on all nodes in 'inline_ready'. - void ProcessInline(const TaggedNodeReadyQueue& inline_ready); - // Schedule all the expensive nodes in 'ready', and put all the inexpensive // nodes in 'ready' into 'inline_ready'. void ScheduleReady(const TaggedNodeSeq& ready, @@ -1654,17 +1651,6 @@ bool ExecutorState::NodeDone(const Status& s, const Node* node, return completed; } -void ExecutorState::ProcessInline(const TaggedNodeReadyQueue& inline_ready) { - if (inline_ready.empty()) return; - int64 scheduled_usec = 0; - if (stats_collector_) { - scheduled_usec = nodestats::NowInUsec(); - } - for (auto& tagged_node : inline_ready) { - Process(tagged_node, scheduled_usec); - } -} - void ExecutorState::ScheduleReady(const TaggedNodeSeq& ready, TaggedNodeReadyQueue* inline_ready) { if (ready.empty()) return; diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index 1ccf66ed346..fc859426b57 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -219,10 +219,34 @@ class PassOn : public OpKernel { } } }; + REGISTER_KERNEL_BUILDER(Name("_ListToArray").Device(DEVICE_CPU), PassOn); -REGISTER_KERNEL_BUILDER(Name("_ListToArray").Device(DEVICE_GPU), PassOn); REGISTER_KERNEL_BUILDER(Name("_ArrayToList").Device(DEVICE_CPU), PassOn); -REGISTER_KERNEL_BUILDER(Name("_ArrayToList").Device(DEVICE_GPU), PassOn); + +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER( \ + Name("_ListToArray").Device(DEVICE_GPU).TypeConstraint("T"), \ + PassOn); \ + 
REGISTER_KERNEL_BUILDER( \ + Name("_ArrayToList").Device(DEVICE_GPU).TypeConstraint("T"), \ + PassOn); + +REGISTER_GPU_KERNELS(Eigen::half); +REGISTER_GPU_KERNELS(float); +REGISTER_GPU_KERNELS(double); + +#undef REGISTER_GPU_KERNELS + +REGISTER_KERNEL_BUILDER(Name("_ListToArray") + .Device(DEVICE_GPU) + .HostMemory("output") + .TypeConstraint("T"), + PassOn); +REGISTER_KERNEL_BUILDER(Name("_ArrayToList") + .Device(DEVICE_GPU) + .HostMemory("input") + .TypeConstraint("T"), + PassOn); static const FunctionLibraryRuntime::Handle kInvalidHandle = -1; @@ -248,6 +272,11 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { bool IsStateful(const string& function) override; + const FunctionLibraryDefinition* GetFunctionLibraryDefinition() + const override { + return lib_def_; + } + Device* device() override { return device_; } private: @@ -383,23 +412,22 @@ class SymbolicGradientOp : public AsyncOpKernel { args.push_back(ctx->input(i)); } std::vector* rets = new std::vector; - lib->Run(opts, handle_, args, rets, - [ctx, done, rets](const Status& status) { - if (!status.ok()) { - ctx->SetStatus(status); - } else if (rets->size() != ctx->num_outputs()) { - ctx->SetStatus(errors::InvalidArgument( - "SymGrad expects to return ", ctx->num_outputs(), - " tensor(s), but get ", rets->size(), - " tensor(s) instead.")); - } else { - for (size_t i = 0; i < rets->size(); ++i) { - ctx->set_output(i, (*rets)[i]); - } - } - delete rets; - done(); - }); + lib->Run( + opts, handle_, args, rets, [ctx, done, rets](const Status& status) { + if (!status.ok()) { + ctx->SetStatus(status); + } else if (rets->size() != ctx->num_outputs()) { + ctx->SetStatus(errors::InvalidArgument( + "SymGrad expects to return ", ctx->num_outputs(), + " tensor(s), but get ", rets->size(), " tensor(s) instead.")); + } else { + for (size_t i = 0; i < rets->size(); ++i) { + ctx->set_output(i, (*rets)[i]); + } + } + delete rets; + done(); + }); } private: diff --git 
a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc index 8ac62e45d22..36fb1e97c76 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.cc +++ b/tensorflow/core/common_runtime/graph_optimizer.cc @@ -60,7 +60,7 @@ void GraphOptimizer::Optimize(FunctionLibraryRuntime* runtime, Device* device, if (opts_.do_constant_folding()) { ConstantFoldingOptions cf_opts; - if (DoConstantFolding(cf_opts, device, g)) { + if (DoConstantFolding(cf_opts, runtime, device, g)) { RemoveDeadNodes(g); DumpGraph("ConstFolding", g); changed = true; diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD new file mode 100644 index 00000000000..da4c45520e1 --- /dev/null +++ b/tensorflow/core/debug/BUILD @@ -0,0 +1,157 @@ +# Description: +# TensorFlow Debugger (tfdbg). +# +# Public Android targets: +# filegroup ":android_srcs" - Debugger source files for Android. + +package( + default_visibility = ["//tensorflow:internal"], + features = ["-parse_headers"], +) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +load( + "//tensorflow:tensorflow.bzl", + "tf_copts", + "tf_cc_test", + "tf_cuda_library", +) +load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu") + +# For platform specific build config +load( + "//tensorflow/core:platform/default/build_config.bzl", + "tf_kernel_tests_linkstatic", +) +load( + "//tensorflow/core:platform/default/build_config_root.bzl", + "tf_cuda_tests_tags", +) + +tf_cuda_library( + name = "debug_gateway_internal", + srcs = ["debug_gateway.cc"], + hdrs = ["debug_gateway.h"], + copts = tf_copts(), + linkstatic = 1, + deps = [ + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:direct_session_internal", + "//tensorflow/core:framework", + "//tensorflow/core:gpu_tracer", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:proto_text", + "//tensorflow/core:protos_all_cc", + ], + alwayslink = 1, +) + +tf_cuda_library( + name = 
"debug_graph_utils", + srcs = ["debug_graph_utils.cc"], + hdrs = ["debug_graph_utils.h"], + copts = tf_copts(), + linkstatic = 1, + deps = [ + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:proto_text", + "//tensorflow/core:protos_all_cc", + ], + alwayslink = 1, +) + +tf_cuda_library( + name = "debug_io_utils", + srcs = ["debug_io_utils.cc"], + hdrs = ["debug_io_utils.h"], + copts = tf_copts(), + linkstatic = 1, + deps = [ + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:proto_text", + "//tensorflow/core:protos_all_cc", + ], + alwayslink = 1, +) + +tf_cc_test_gpu( + name = "debug_gateway_test", + size = "small", + args = ["--heap_check=local"], + linkstatic = tf_kernel_tests_linkstatic(), + tags = tf_cuda_tests_tags() + ["nomac"], + deps = [ + ":debug_gateway_internal", + ":debug_graph_utils", + "//tensorflow/cc:cc_ops", + "//tensorflow/core:all_kernels", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:direct_session", + "//tensorflow/core:direct_session_internal", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:gpu_runtime", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/kernels:debug_ops", + "//tensorflow/core/kernels:ops_util", + ], +) + +tf_cc_test( + name = "debug_io_utils_test", + size = "small", + linkstatic = tf_kernel_tests_linkstatic(), + deps = [ + ":debug_io_utils", + "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + 
"//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + +filegroup( + name = "android_srcs", + srcs = [ + "debug_graph_utils.cc", + "debug_graph_utils.h", + ], + visibility = ["//visibility:public"], +) + +# ----------------------------------------------------------------------------- +# Google-internal targets. These must be at the end for syncrepo. + +filegroup( + name = "all_files", + srcs = glob( + ["**/*"], + exclude = [ + "**/METADATA", + "**/OWNERS", + ], + ), + visibility = ["//tensorflow:__subpackages__"], +) diff --git a/tensorflow/core/debug/debug_graph_utils.cc b/tensorflow/core/debug/debug_graph_utils.cc index 8374b245cb6..118847686d3 100644 --- a/tensorflow/core/debug/debug_graph_utils.cc +++ b/tensorflow/core/debug/debug_graph_utils.cc @@ -36,6 +36,8 @@ Status DebugNodeInserter::InsertNodes( // A map from tensor name (e.g., "node_a:0") to list of debug op names // (e.g., {"DebugIdentity", "DebugNanCount"}) std::unordered_map> tensor_watches; + // A map from tensor name to debug_url. 
+ std::unordered_map> tensor_watch_urls; // Cache the proto content for fast lookup later for (const DebugTensorWatch& watch : watches) { @@ -58,6 +60,12 @@ Status DebugNodeInserter::InsertNodes( } tensor_watches[tensor_name] = debug_ops; + + std::vector urls; + for (const string& url : watch.debug_urls()) { + urls.push_back(url); + } + tensor_watch_urls[tensor_name] = urls; } if (tensor_watches.empty()) { @@ -150,9 +158,9 @@ Status DebugNodeInserter::InsertNodes( const string& debug_op_name = tensor_watches[tensor_name][i]; Node* debug_node; - Status debug_s = - CreateDebugNode(graph, device_type, copy_node->name(), src_dt, - tensor_name, i, debug_op_name, &debug_node); + Status debug_s = CreateDebugNode( + graph, device_type, copy_node->name(), src_dt, tensor_name, + tensor_watch_urls[tensor_name], i, debug_op_name, &debug_node); if (!debug_s.ok()) { return Status( error::FAILED_PRECONDITION, @@ -267,17 +275,17 @@ Status DebugNodeInserter::CreateCopyNode( Status DebugNodeInserter::CreateDebugNode( Graph* graph, const DeviceType device_type, const string& src_copy_node_name, const DataType src_dt, - const string& tensor_name, const int debug_op_num, - const string& debug_op_name, Node** debug_node) { + const string& tensor_name, const std::vector& debug_urls, + const int debug_op_num, const string& debug_op_name, Node** debug_node) { NodeDef node_def; const KernelDef* kdef; const string debug_node_name = GetDebugNodeName(tensor_name, debug_op_num, debug_op_name); - // TODO(cais): Hook up with DebugTensorWatch proto auto builder = NodeDefBuilder(debug_node_name, debug_op_name) .Input(src_copy_node_name, 0, src_dt) - .Attr("tensor_name", tensor_name); + .Attr("tensor_name", tensor_name) + .Attr("debug_urls", debug_urls); if (!builder.Finalize(&node_def).ok()) { return Status( diff --git a/tensorflow/core/debug/debug_graph_utils.h b/tensorflow/core/debug/debug_graph_utils.h index 41789a30ffe..ea61dee4d08 100644 --- a/tensorflow/core/debug/debug_graph_utils.h +++ 
b/tensorflow/core/debug/debug_graph_utils.h @@ -94,6 +94,7 @@ class DebugNodeInserter { const string& src_copy_node_name, const DataType src_dt, const string& tensor_name, + const std::vector& debug_urls, const int debug_op_num, const string& debug_op_name, Node** debug_node); }; diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc new file mode 100644 index 00000000000..474577a79c0 --- /dev/null +++ b/tensorflow/core/debug/debug_io_utils.cc @@ -0,0 +1,211 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/debug/debug_io_utils.h" + +#include + +#include "tensorflow/core/framework/summary.pb.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/util/event.pb.h" + +namespace tensorflow { + +namespace { + +// Encapsulate the tensor value inside a Summary proto, and then inside an +// Event proto. +Event WrapTensorAsEvent(const string& tensor_name, const string& debug_op, + const Tensor& tensor, const uint64 wall_time_us) { + Event event; + event.set_wall_time(static_cast(wall_time_us)); + + Summary::Value* summ_val = event.mutable_summary()->add_value(); + + // Create the debug node_name in the Summary proto. 
+ // For example, if tensor_name = "foo/node_a:0", and the debug_op is + // "DebugIdentity", the debug node_name in the Summary proto will be + // "foo/node_a:0:DebugIdentity". + const string debug_node_name = strings::StrCat(tensor_name, ":", debug_op); + summ_val->set_node_name(debug_node_name); + + if (tensor.dtype() == DT_STRING) { + // Treat DT_STRING specially, so that tensor_util.MakeNdarray can convert + // the TensorProto to string-type numpy array. MakeNdarray does not work + // with strings encoded by AsProtoTensorContent() in tensor_content. + tensor.AsProtoField(summ_val->mutable_tensor()); + } else { + tensor.AsProtoTensorContent(summ_val->mutable_tensor()); + } + + return event; +} + +} // namespace + +// static +const char* const DebugIO::kFileURLScheme = "file://"; +// static +const char* const DebugIO::kGrpcURLScheme = "grpc://"; + +Status DebugIO::PublishDebugTensor(const string& tensor_name, + const string& debug_op, const Tensor& tensor, + const uint64 wall_time_us, + const gtl::ArraySlice& debug_urls) { + // Split the tensor_name into node name and output slot index. 
+ std::vector name_items = str_util::Split(tensor_name, ':'); + string node_name; + int32 output_slot = 0; + if (name_items.size() == 2) { + node_name = name_items[0]; + if (!strings::safe_strto32(name_items[1], &output_slot)) { + return Status(error::INVALID_ARGUMENT, + strings::StrCat("Invalid string value for output_slot: \"", + name_items[1], "\"")); + } + } else if (name_items.size() == 1) { + node_name = name_items[0]; + } else { + return Status( + error::INVALID_ARGUMENT, + strings::StrCat("Failed to parse tensor name: \"", tensor_name, "\"")); + } + + int num_failed_urls = 0; + for (const string& url : debug_urls) { + if (str_util::Lowercase(url).find(kFileURLScheme) == 0) { + const string dump_root_dir = url.substr(strlen(kFileURLScheme)); + + Status s = + DebugFileIO::DumpTensorToDir(node_name, output_slot, debug_op, tensor, + wall_time_us, dump_root_dir, nullptr); + if (!s.ok()) { + num_failed_urls++; + } + } else if (str_util::Lowercase(url).find(kGrpcURLScheme) == 0) { + // TODO(cais): Implement PublishTensor with grpc urls. 
+ return Status(error::UNIMPLEMENTED, + strings::StrCat("Puslishing to GRPC debug target is not ", + "implemented yet")); + } else { + return Status(error::UNAVAILABLE, + strings::StrCat("Invalid debug target URL: ", url)); + } + } + + if (num_failed_urls == 0) { + return Status::OK(); + } else { + return Status( + error::INTERNAL, + strings::StrCat("Puslishing to ", num_failed_urls, " of ", + debug_urls.size(), " debug target URLs failed")); + } +} + +// static +Status DebugFileIO::DumpTensorToDir( + const string& node_name, const int32 output_slot, const string& debug_op, + const Tensor& tensor, const uint64 wall_time_us, + const string& dump_root_dir, string* dump_file_path) { + const string file_path = GetDumpFilePath(dump_root_dir, node_name, + output_slot, debug_op, wall_time_us); + + if (dump_file_path != nullptr) { + *dump_file_path = file_path; + } + + return DumpTensorToEventFile(node_name, output_slot, debug_op, tensor, + wall_time_us, file_path); +} + +// static +string DebugFileIO::GetDumpFilePath(const string& dump_root_dir, + const string& node_name, + const int32 output_slot, + const string& debug_op, + const uint64 wall_time_us) { + return io::JoinPath( + dump_root_dir, strings::StrCat(node_name, "_", output_slot, "_", debug_op, + "_", wall_time_us)); +} + +// static +Status DebugFileIO::DumpTensorToEventFile( + const string& node_name, const int32 output_slot, const string& debug_op, + const Tensor& tensor, const uint64 wall_time_us, const string& file_path) { + Env* env(Env::Default()); + + // Create the directory if necessary. 
+ string file_dir = io::Dirname(file_path).ToString(); + Status s = DebugFileIO::RecursiveCreateDir(env, file_dir); + + if (!s.ok()) { + return Status(error::FAILED_PRECONDITION, + strings::StrCat("Failed to create directory ", file_dir, + ", due to: ", s.error_message())); + } + + const string tensor_name = strings::StrCat(node_name, ":", output_slot); + Event event = WrapTensorAsEvent(tensor_name, debug_op, tensor, wall_time_us); + + string event_str; + event.SerializeToString(&event_str); + + std::unique_ptr f = nullptr; + TF_CHECK_OK(env->NewWritableFile(file_path, &f)); + f->Append(event_str); + TF_CHECK_OK(f->Close()); + + return Status::OK(); +} + +// static +Status DebugFileIO::RecursiveCreateDir(Env* env, const string& dir) { + if (env->FileExists(dir) && env->IsDirectory(dir).ok()) { + // The path already exists as a directory. Return OK right away. + return Status::OK(); + } + + string parent_dir = io::Dirname(dir).ToString(); + if (!env->FileExists(parent_dir)) { + // The parent path does not exist yet, create it first. + Status s = RecursiveCreateDir(env, parent_dir); // Recursive call + if (!s.ok()) { + return Status( + error::FAILED_PRECONDITION, + strings::StrCat("Failed to create directory ", parent_dir)); + } + } else if (env->FileExists(parent_dir) && + !env->IsDirectory(parent_dir).ok()) { + // The path exists, but it is a file. + return Status(error::FAILED_PRECONDITION, + strings::StrCat("Failed to create directory ", parent_dir, + " because the path exists as a file ")); + } + + env->CreateDir(dir); + // Guard against potential race in creating directories by doing a check + // after the CreateDir call. 
+ if (env->FileExists(dir) && env->IsDirectory(dir).ok()) { + return Status::OK(); + } else { + return Status(error::ABORTED, + strings::StrCat("Failed to create directory ", parent_dir)); + } +} + +} // namespace tensorflow diff --git a/tensorflow/core/debug/debug_io_utils.h b/tensorflow/core/debug/debug_io_utils.h new file mode 100644 index 00000000000..553ae9ab7d2 --- /dev/null +++ b/tensorflow/core/debug/debug_io_utils.h @@ -0,0 +1,107 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_DEBUG_IO_UTILS_H_ +#define TENSORFLOW_DEBUG_IO_UTILS_H_ + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { + +class DebugIO { + public: + // Publish a tensor to a debug target URL. + // + // Args: + // tensor_name: Name of the tensor being published: node_name followed by + // a colon, followed by the output slot index. E.g., "node_a:0". + // debug_op: Name of the debug op, e.g., "DebugIdentity". + // tensor: The Tensor object being published. + // wall_time_us: Time stamp for the Tensor. Unit: microseconds (us). 
+ // debug_urls: An array of debug target URLs, e.g., + // "file:///foo/tfdbg_dump", "grpc://localhot:11011" + static Status PublishDebugTensor(const string& tensor_name, + const string& debug_op, const Tensor& tensor, + const uint64 wall_time_us, + const gtl::ArraySlice& debug_urls); + + private: + static const char* const kFileURLScheme; + static const char* const kGrpcURLScheme; +}; + +// Helper class for debug ops. +class DebugFileIO { + public: + // Encapsulate the Tensor in an Event protobuf and write it to a directory. + // The actual path of the dump file will be a contactenation of + // dump_root_dir, tensor_name, along with the wall_time. + // + // For example: + // let dump_root_dir = "/tmp/tfdbg_dump", + // node_name = "foo/bar", + // output_slot = 0, + // debug_op = DebugIdentity, + // and wall_time_us = 1467891234512345, + // the dump file will be generated at path: + // /tmp/tfdbg_dump/foo/bar_0_DebugIdentity_1467891234512345. + // + // Args: + // node_name: Name of the node from which the tensor is output. + // output_slot: Output slot index. + // debug_op: Name of the debug op, e.g., "DebugIdentity". + // tensor: The Tensor object to be dumped to file. + // wall_time_us: Wall time at which the Tensor is generated during graph + // execution. Unit: microseconds (us). + // dump_root_dir: Root diretory for dumping the tensor. + // dump_file_path: The actual dump file path (passed as reference). + static Status DumpTensorToDir(const string& node_name, + const int32 output_slot, const string& debug_op, + const Tensor& tensor, const uint64 wall_time_us, + const string& dump_root_dir, + string* dump_file_path); + + // Get the full path to the dump file. + // + // Args: + // dump_root_dir: The dump root directory, e.g., /tmp/tfdbg_dump + // node_name: Name of the node from which the dumped tensor is generated, + // e.g., foo/bar/node_a + // output_slot: Output slot index of the said node, e.g., 0. + // debug_op: Name of the debug op, e.g., DebugIdentity. 
+ // wall_time_us: Time stamp of the dumped tensor, in microseconds (us). + static string GetDumpFilePath(const string& dump_root_dir, + const string& node_name, + const int32 output_slot, const string& debug_op, + const uint64 wall_time_us); + + private: + // Encapsulate the Tensor in an Event protobuf and write it to file. + static Status DumpTensorToEventFile( + const string& node_name, const int32 output_slot, const string& debug_op, + const Tensor& tensor, const uint64 wall_time_us, const string& file_path); + + // Implemented ad hoc here for now. + // TODO(cais): Replace with shared implementation once http://b/30497715 is + // fixed. + static Status RecursiveCreateDir(Env* env, const string& dir); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_DEBUG_IO_UTILS_H_ diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc new file mode 100644 index 00000000000..ecdda643c3a --- /dev/null +++ b/tensorflow/core/debug/debug_io_utils_test.cc @@ -0,0 +1,382 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/debug/debug_io_utils.h" + +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/util/event.pb.h" + +namespace tensorflow { +namespace { + +class DebugIOUtilsTest : public ::testing::Test { + public: + void Initialize() { + env_ = Env::Default(); + + tensor_a_.reset(new Tensor(DT_FLOAT, TensorShape({2, 2}))); + tensor_a_->flat()(0) = 5.0; + tensor_a_->flat()(1) = 3.0; + tensor_a_->flat()(2) = -1.0; + tensor_a_->flat()(3) = 0.0; + + tensor_b_.reset(new Tensor(DT_STRING, TensorShape{2})); + tensor_b_->flat()(0) = "corge"; + tensor_b_->flat()(1) = "garply"; + } + + Status ReadEventFromFile(const string& dump_file_path, Event* event) { + string content; + uint64 file_size = 0; + + Status s = env_->GetFileSize(dump_file_path, &file_size); + if (!s.ok()) { + return s; + } + + content.resize(file_size); + + std::unique_ptr file; + s = env_->NewRandomAccessFile(dump_file_path, &file); + if (!s.ok()) { + return s; + } + + StringPiece result; + s = file->Read(0, file_size, &result, &(content)[0]); + if (!s.ok()) { + return s; + } + + event->ParseFromString(content); + return Status::OK(); + } + + Env* env_; + std::unique_ptr tensor_a_; + std::unique_ptr tensor_b_; +}; + +TEST_F(DebugIOUtilsTest, DumpFloatTensorToFileSunnyDay) { + Initialize(); + + const string test_dir = testing::TmpDir(); + + // Append levels of nonexisting directories, to test that the function can + // create directories. 
+ const string kNodeName = "foo/bar/qux/tensor_a"; + const string kDebugOpName = "DebugIdentity"; + const int32 output_slot = 0; + uint64 wall_time = env_->NowMicros(); + + string dump_file_path; + TF_ASSERT_OK(DebugFileIO::DumpTensorToDir(kNodeName, output_slot, + kDebugOpName, *tensor_a_, wall_time, + test_dir, &dump_file_path)); + + // Read the file into a Event proto. + Event event; + TF_ASSERT_OK(ReadEventFromFile(dump_file_path, &event)); + + ASSERT_GE(wall_time, event.wall_time()); + ASSERT_EQ(1, event.summary().value().size()); + ASSERT_EQ(strings::StrCat(kNodeName, ":", output_slot, ":", kDebugOpName), + event.summary().value(0).node_name()); + + Tensor a_prime(DT_FLOAT); + ASSERT_TRUE(a_prime.FromProto(event.summary().value(0).tensor())); + + // Verify tensor shape and value. + ASSERT_EQ(tensor_a_->shape(), a_prime.shape()); + for (int i = 0; i < a_prime.flat().size(); ++i) { + ASSERT_EQ(tensor_a_->flat()(i), a_prime.flat()(i)); + } + + // Tear down temporary file and directories. + int64 undeleted_files = 0; + int64 undeleted_dirs = 0; + ASSERT_TRUE( + env_->DeleteRecursively(test_dir, &undeleted_files, &undeleted_dirs) + .ok()); + ASSERT_EQ(0, undeleted_files); + ASSERT_EQ(0, undeleted_dirs); +} + +TEST_F(DebugIOUtilsTest, DumpStringTensorToFileSunnyDay) { + Initialize(); + + const string test_dir = testing::TmpDir(); + + const string kNodeName = "quux/grault/tensor_b"; + const string kDebugOpName = "DebugIdentity"; + const int32 output_slot = 1; + uint64 wall_time = env_->NowMicros(); + + string dump_file_name; + Status s = DebugFileIO::DumpTensorToDir(kNodeName, output_slot, kDebugOpName, + *tensor_b_, wall_time, test_dir, + &dump_file_name); + ASSERT_TRUE(s.ok()); + + // Read the file into a Event proto. 
+ Event event; + TF_ASSERT_OK(ReadEventFromFile(dump_file_name, &event)); + + ASSERT_GE(wall_time, event.wall_time()); + ASSERT_EQ(1, event.summary().value().size()); + ASSERT_EQ(strings::StrCat(kNodeName, ":", output_slot, ":", kDebugOpName), + event.summary().value(0).node_name()); + + Tensor b_prime(DT_STRING); + ASSERT_TRUE(b_prime.FromProto(event.summary().value(0).tensor())); + + // Verify tensor shape and value. + ASSERT_EQ(tensor_b_->shape(), b_prime.shape()); + for (int i = 0; i < b_prime.flat().size(); ++i) { + ASSERT_EQ(tensor_b_->flat()(i), b_prime.flat()(i)); + } + + // Tear down temporary file and directories. + int64 undeleted_files = 0; + int64 undeleted_dirs = 0; + ASSERT_TRUE( + env_->DeleteRecursively(test_dir, &undeleted_files, &undeleted_dirs) + .ok()); + ASSERT_EQ(0, undeleted_files); + ASSERT_EQ(0, undeleted_dirs); +} + +TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) { + Initialize(); + + // First, create the file at the path. + const string test_dir = testing::TmpDir(); + const string txt_file_name = strings::StrCat(test_dir, "/baz"); + + if (!env_->FileExists(test_dir)) { + ASSERT_TRUE(env_->CreateDir(test_dir).ok()); + } + ASSERT_FALSE(env_->FileExists(txt_file_name)); + + std::unique_ptr file; + ASSERT_TRUE(env_->NewWritableFile(txt_file_name, &file).ok()); + file->Append("text in baz"); + file->Flush(); + file->Close(); + + // Verify that the path exists and that it is a file, not a directory. + ASSERT_TRUE(env_->FileExists(txt_file_name)); + ASSERT_FALSE(env_->IsDirectory(txt_file_name).ok()); + + // Second, try to dump the tensor to a path that requires "baz" to be a + // directory, which should lead to an error. 
+ const string kNodeName = "baz/tensor_a"; + const string kDebugOpName = "DebugIdentity"; + const int32 output_slot = 0; + uint64 wall_time = env_->NowMicros(); + + string dump_file_name; + Status s = DebugFileIO::DumpTensorToDir(kNodeName, output_slot, kDebugOpName, + *tensor_a_, wall_time, test_dir, + &dump_file_name); + ASSERT_FALSE(s.ok()); + + // Tear down temporary file and directories. + int64 undeleted_files = 0; + int64 undeleted_dirs = 0; + ASSERT_TRUE( + env_->DeleteRecursively(test_dir, &undeleted_files, &undeleted_dirs) + .ok()); + ASSERT_EQ(0, undeleted_files); + ASSERT_EQ(0, undeleted_dirs); +} + +TEST_F(DebugIOUtilsTest, PublishTensorToMultipleFileURLs) { + Initialize(); + + const int kNumDumpRoots = 3; + const string kNodeName = "foo/bar/qux/tensor_a"; + const string kDebugOpName = "DebugIdentity"; + const int32 output_slot = 0; + + uint64 wall_time = env_->NowMicros(); + + std::vector dump_roots; + std::vector dump_file_paths; + std::vector urls; + for (int i = 0; i < kNumDumpRoots; ++i) { + string dump_root = strings::StrCat(testing::TmpDir(), "/", i); + + dump_roots.push_back(dump_root); + dump_file_paths.push_back(DebugFileIO::GetDumpFilePath( + dump_root, kNodeName, output_slot, kDebugOpName, wall_time)); + urls.push_back(strings::StrCat("file://", dump_root)); + } + + for (int i = 1; i < kNumDumpRoots; ++i) { + ASSERT_NE(dump_roots[0], dump_roots[i]); + } + + const string tensor_name = strings::StrCat(kNodeName, ":", output_slot); + const string debug_node_name = + strings::StrCat(tensor_name, ":", kDebugOpName); + Status s = DebugIO::PublishDebugTensor(tensor_name, kDebugOpName, *tensor_a_, + wall_time, urls); + ASSERT_TRUE(s.ok()); + + // Try reading the file into a Event proto. + for (int i = 0; i < kNumDumpRoots; ++i) { + // Read the file into a Event proto. 
+ Event event; + TF_ASSERT_OK(ReadEventFromFile(dump_file_paths[i], &event)); + + ASSERT_GE(wall_time, event.wall_time()); + ASSERT_EQ(1, event.summary().value().size()); + ASSERT_EQ(debug_node_name, event.summary().value(0).node_name()); + + Tensor a_prime(DT_FLOAT); + ASSERT_TRUE(a_prime.FromProto(event.summary().value(0).tensor())); + + // Verify tensor shape and value. + ASSERT_EQ(tensor_a_->shape(), a_prime.shape()); + for (int i = 0; i < a_prime.flat().size(); ++i) { + ASSERT_EQ(tensor_a_->flat()(i), a_prime.flat()(i)); + } + } + + // Tear down temporary file and directories. + for (int i = 0; i < kNumDumpRoots; ++i) { + int64 undeleted_files = 0; + int64 undeleted_dirs = 0; + ASSERT_TRUE(env_->DeleteRecursively(dump_roots[i], &undeleted_files, + &undeleted_dirs) + .ok()); + ASSERT_EQ(0, undeleted_files); + ASSERT_EQ(0, undeleted_dirs); + } +} + +TEST_F(DebugIOUtilsTest, PublishTensorConcurrentlyToPartiallyOverlappingPaths) { + Initialize(); + + const int kConcurrentPubs = 3; + const string kNodeName = "tensor_a"; + const string kDebugOpName = "DebugIdentity"; + const int32 kOutputSlot = 0; + + thread::ThreadPool* tp = + new thread::ThreadPool(Env::Default(), "test", kConcurrentPubs); + uint64 wall_time = env_->NowMicros(); + + const string dump_root_base = testing::TmpDir(); + const string tensor_name = strings::StrCat(kNodeName, ":", kOutputSlot); + const string debug_node_name = + strings::StrCat(tensor_name, ":", kDebugOpName); + + mutex mu; + std::vector dump_roots GUARDED_BY(mu); + std::vector dump_file_paths GUARDED_BY(mu); + + int dump_count GUARDED_BY(mu) = 0; + int done_count GUARDED_BY(mu) = 0; + Notification all_done; + + auto fn = [this, &dump_count, &done_count, &mu, &dump_root_base, &dump_roots, + &dump_file_paths, &wall_time, &tensor_name, &debug_node_name, + &kNodeName, &kDebugOpName, &kConcurrentPubs, &all_done]() { + // "grumpy" is the shared directory part of the path. 
+ string dump_root; + string debug_url; + { + mutex_lock l(mu); + dump_root = + strings::StrCat(dump_root_base, "grumpy/", "dump_", dump_count++); + + dump_roots.push_back(dump_root); + dump_file_paths.push_back(DebugFileIO::GetDumpFilePath( + dump_root, kNodeName, kOutputSlot, kDebugOpName, wall_time)); + + debug_url = strings::StrCat("file://", dump_root); + } + + std::vector urls; + urls.push_back(debug_url); + Status s = DebugIO::PublishDebugTensor(tensor_name, kDebugOpName, + *tensor_a_, wall_time, urls); + ASSERT_TRUE(s.ok()); + + { + mutex_lock l(mu); + + done_count++; + if (done_count == kConcurrentPubs) { + all_done.Notify(); + } + } + }; + + for (int i = 0; i < kConcurrentPubs; ++i) { + tp->Schedule(fn); + } + + // Wait for all dumping calls to finish. + all_done.WaitForNotification(); + delete tp; + + { + mutex_lock l(mu); + + for (int i = 1; i < kConcurrentPubs; ++i) { + ASSERT_NE(dump_roots[0], dump_roots[i]); + } + + // Try reading the file into a Event proto. + for (int i = 0; i < kConcurrentPubs; ++i) { + // Read the file into a Event proto. + Event event; + TF_ASSERT_OK(ReadEventFromFile(dump_file_paths[i], &event)); + + ASSERT_GE(wall_time, event.wall_time()); + ASSERT_EQ(1, event.summary().value().size()); + ASSERT_EQ(debug_node_name, event.summary().value(0).node_name()); + + Tensor a_prime(DT_FLOAT); + ASSERT_TRUE(a_prime.FromProto(event.summary().value(0).tensor())); + + // Verify tensor shape and value. + ASSERT_EQ(tensor_a_->shape(), a_prime.shape()); + for (int i = 0; i < a_prime.flat().size(); ++i) { + ASSERT_EQ(tensor_a_->flat()(i), a_prime.flat()(i)); + } + } + + // Tear down temporary file and directories. 
+ int64 undeleted_files = 0; + int64 undeleted_dirs = 0; + ASSERT_TRUE(env_->DeleteRecursively(dump_root_base, &undeleted_files, + &undeleted_dirs) + .ok()); + ASSERT_EQ(0, undeleted_files); + ASSERT_EQ(0, undeleted_dirs); + } +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD index 51bb908213c..244ba8bb7e1 100644 --- a/tensorflow/core/distributed_runtime/BUILD +++ b/tensorflow/core/distributed_runtime/BUILD @@ -51,9 +51,14 @@ cc_library( cc_library( name = "worker_interface", - hdrs = ["worker_interface.h"], + srcs = ["tensor_coding.cc"], + hdrs = [ + "tensor_coding.h", + "worker_interface.h", + ], deps = [ ":call_options", + "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:worker_proto_cc", ], @@ -79,6 +84,22 @@ cc_test( ], ) +cc_test( + name = "tensor_coding_test", + size = "small", + srcs = ["tensor_coding_test.cc"], + linkstatic = 1, + deps = [ + ":worker_interface", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:tensor_testutil", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:worker_proto_cc", + ], +) + cc_library( name = "worker_cache", hdrs = ["worker_cache.h"], diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index d9fd1d406ad..ea8de56e324 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -182,8 +182,8 @@ Status GrpcServer::Init() { // Finish setting up worker environment. 
worker_env_.graph_mgr = new GraphMgr(&worker_env_); - worker_env_.rendezvous_mgr = new RpcRendezvousMgr(&worker_env_); worker_env_.compute_pool = ComputePool(sess_opts); + worker_env_.rendezvous_mgr = new RpcRendezvousMgr(&worker_env_); return Status::OK(); } diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc index 96f7db2694b..b01d603c6a6 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc @@ -37,8 +37,9 @@ namespace { class RpcRemoteRendezvous : public BaseRemoteRendezvous { public: - RpcRemoteRendezvous(const WorkerEnv* env, int64 step_id) - : BaseRemoteRendezvous(env, step_id, false) {} + RpcRemoteRendezvous(const WorkerEnv* env, WorkerCacheInterface* cache, + int64 step_id) + : BaseRemoteRendezvous(env, step_id, false), cache_(cache) {} protected: void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed, @@ -48,6 +49,7 @@ class RpcRemoteRendezvous : public BaseRemoteRendezvous { private: ~RpcRemoteRendezvous() override {} + WorkerCacheInterface* cache_; // Not owned. 
TF_DISALLOW_COPY_AND_ASSIGN(RpcRemoteRendezvous); }; @@ -55,13 +57,12 @@ class RpcRemoteRendezvous : public BaseRemoteRendezvous { class RpcRecvTensorCall : public BaseRecvTensorCall { public: RpcRecvTensorCall() - : wi_(nullptr), wc_(nullptr), allocator_(nullptr), dst_device_(nullptr) {} + : wi_(nullptr), allocator_(nullptr), dst_device_(nullptr) {} - void Init(WorkerCacheInterface* wc, WorkerInterface* wi, int64 step_id, - StringPiece key, Allocator* allocator, Device* dst_device, + void Init(WorkerInterface* wi, int64 step_id, StringPiece key, + Allocator* allocator, Device* dst_device, const Rendezvous::Args& recv_args, Rendezvous::DoneCallback done) { wi_ = wi; - wc_ = wc; allocator_ = allocator; dst_device_ = dst_device; recv_args_ = recv_args; @@ -73,7 +74,6 @@ class RpcRecvTensorCall : public BaseRecvTensorCall { void Reset() { delete wi_; wi_ = nullptr; - wc_ = nullptr; allocator_ = nullptr; dst_device_ = nullptr; // We don't clear opts_ and assume that Init will set up the state for @@ -123,6 +123,8 @@ class RpcRecvTensorCall : public BaseRecvTensorCall { const Rendezvous::DoneCallback& done() const { return done_; } private: + friend class RpcRemoteRendezvous; + // Start the main RecvTensor call, checking for an async abort. void StartRTCall(std::function recv_done) { wi_->RecvTensorAsync(&opts_, &req_, &resp_, @@ -137,8 +139,9 @@ class RpcRecvTensorCall : public BaseRecvTensorCall { }); } - WorkerInterface* wi_; // Owned. - WorkerCacheInterface* wc_; // Not owned. 
+ string src_worker_; + string src_rel_device_; + WorkerInterface* wi_; Allocator* allocator_; Device* dst_device_; CallOptions opts_; @@ -153,7 +156,6 @@ class RpcRecvTensorCall : public BaseRecvTensorCall { TF_DISALLOW_COPY_AND_ASSIGN(RpcRecvTensorCall); }; -namespace { class RpcRecvTensorFreeList { public: RpcRecvTensorFreeList() {} @@ -195,32 +197,99 @@ class RpcRecvTensorFreeList { }; static RpcRecvTensorFreeList call_freelist_; -} + +// A private cache that wraps env->worker_cache and allows reuse of +// WorkerInterface objects. +class WorkerFreeListCache : public WorkerCacheInterface { + public: + explicit WorkerFreeListCache(WorkerCacheInterface* w) : wrapped_(w) {} + + ~WorkerFreeListCache() { + for (auto p : workers_) { + delete p.second.worker; + } + } + + void ListWorkers(std::vector* workers) override { + wrapped_->ListWorkers(workers); + } + + WorkerInterface* CreateWorker(const string& target) override { + mutex_lock l(mu_); + auto p = workers_.find(target); + if (p != workers_.end()) { + return p->second.worker; + } + WorkerState state; + state.worker = wrapped_->CreateWorker(target); + if (state.worker != nullptr) { + workers_.insert(make_pair(target, state)); + } + return state.worker; + } + + void ReleaseWorker(const string& target, WorkerInterface* worker) override { + // TODO(jeff,sanjay): Should decrement ref-count when we implement eviction. 
+ } + + bool GetDeviceBusNonBlocking(const string& device, + BusAdjacency* ba) override { + return wrapped_->GetDeviceBusNonBlocking(device, ba); + } + + void GetDeviceBusAsync(const string& device, BusAdjacency* ba, + StatusCallback done) override { + wrapped_->GetDeviceBusAsync(device, ba, done); + } + + void SetLogging(bool active) override { wrapped_->SetLogging(active); } + + void ClearLogs() override { wrapped_->ClearLogs(); } + + bool RetrieveLogs(int64 step_id, StepStats* ss) override { + return wrapped_->RetrieveLogs(step_id, ss); + } + + private: + WorkerCacheInterface* wrapped_; + + // Information kept per created WorkerInterface. + struct WorkerState { + WorkerInterface* worker; + // TODO(jeff,sanjay): Add reference count if we support eviction. + }; + + // TODO(jeff,sanjay): Eviction when the map becomes too big. + mutex mu_; + std::unordered_map workers_ GUARDED_BY(mu_); +}; void RpcRemoteRendezvous::RecvFromRemoteAsync( const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& recv_args, DoneCallback done) { Status s; - // key.src_device identifies a remote device. - string src_worker; - string src_rel_device; - if (!DeviceNameUtils::SplitDeviceName(parsed.src_device, &src_worker, - &src_rel_device)) { - s = errors::Internal(parsed.src_device, - " is invalid remote source device."); - } // TODO(jeff): Consider checking for a valid worker_cache during the // constructor of RpcRemoteRendezvous, rather than here, to simplify // the twisty logic below. - WorkerCacheInterface* worker_cache = env_->worker_cache; - if (s.ok() && worker_cache == nullptr) { + if (env_->worker_cache == nullptr) { s = errors::Internal("No remote worker cache available."); + done(s, Args(), recv_args, Tensor{}, false); + return; } - WorkerInterface* rwi = - (worker_cache ? worker_cache->CreateWorker(src_worker) : nullptr); + + // Prepare a RecvTensor call that can handle being aborted. 
+ RpcRecvTensorCall* call = call_freelist_.New(); + + // key.src_device identifies a remote device. + if (!DeviceNameUtils::SplitDeviceName(parsed.src_device, &call->src_worker_, + &call->src_rel_device_)) { + s = errors::Internal(parsed.src_device, + " is invalid remote source device."); + } + WorkerInterface* rwi = cache_->CreateWorker(call->src_worker_); if (s.ok() && rwi == nullptr) { - s = errors::Internal("No worker known as ", src_worker); + s = errors::Internal("No worker known as ", call->src_worker_); } Device* dst_device; @@ -228,21 +297,20 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync( s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device); } if (!s.ok()) { + call_freelist_.Release(call); done(s, Args(), recv_args, Tensor{}, false); return; } Allocator* allocator = dst_device->GetAllocator(recv_args.alloc_attrs); - // Prepare a RecvTensor call that can handle being aborted. - RpcRecvTensorCall* call = call_freelist_.New(); - - call->Init(worker_cache, rwi, step_id_, parsed.FullKey(), allocator, - dst_device, recv_args, std::move(done)); + call->Init(rwi, step_id_, parsed.FullKey(), allocator, dst_device, recv_args, + std::move(done)); // Record "call" in active_ so that it can be aborted cleanly. RegisterCall(call); // Start "call". + Ref(); call->Start([this, call]() { // Removes "call" from active_. Prevent StartAbort(). 
DeregisterCall(call); @@ -255,15 +323,22 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync( call->tensor_proto(), call->recv_args().alloc_attrs, &val); } call->done()(s, Args(), call->recv_args(), val, call->is_dead()); + cache_->ReleaseWorker(call->src_worker_, call->wi_); + call->wi_ = nullptr; call_freelist_.Release(call); + Unref(); }); } } // namespace +RpcRendezvousMgr::RpcRendezvousMgr(const WorkerEnv* env) + : BaseRendezvousMgr(env), + cache_(new WorkerFreeListCache(env->worker_cache)) {} + BaseRemoteRendezvous* RpcRendezvousMgr::Create(int64 step_id, const WorkerEnv* worker_env) { - return new RpcRemoteRendezvous(worker_env, step_id); + return new RpcRemoteRendezvous(worker_env, cache_.get(), step_id); } } // end namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h index 7447c94c392..6a65d04ba47 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_RPC_RENDEZVOUS_MGR_H_ #include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h" +#include "tensorflow/core/distributed_runtime/worker_cache.h" #include "tensorflow/core/distributed_runtime/worker_env.h" #include "tensorflow/core/platform/macros.h" @@ -42,13 +43,16 @@ namespace tensorflow { // RendezvousMgr must have keys generated by Rendezvous::CreateKey. class RpcRendezvousMgr : public BaseRendezvousMgr { public: - explicit RpcRendezvousMgr(const WorkerEnv* env) : BaseRendezvousMgr(env) {} + explicit RpcRendezvousMgr(const WorkerEnv* env); protected: BaseRemoteRendezvous* Create(int64 step_id, const WorkerEnv* worker_env) override; private: + // Private cache_ that allows us to reuse WorkerInterface objects. 
+ std::unique_ptr cache_; + TF_DISALLOW_COPY_AND_ASSIGN(RpcRendezvousMgr); }; diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc index 7e18278f309..dce49d33d77 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc @@ -46,10 +46,28 @@ Rendezvous::ParsedKey MakeKey(const string& s) { return key; } +namespace { +// Fake cache implementation for WorkerEnv. +class DummyWorkerCache : public WorkerCacheInterface { + void ListWorkers(std::vector* workers) override {} + WorkerInterface* CreateWorker(const string& target) override { + return nullptr; + } + bool GetDeviceBusNonBlocking(const string& device, + BusAdjacency* ba) override { + return false; + } + void GetDeviceBusAsync(const string& device, BusAdjacency* ba, + StatusCallback done) override {} +}; +} // namespace + TEST(RpcRendezvousMgrTest, LocalSendRecv) { + DummyWorkerCache cache; WorkerEnv env; env.env = Env::Default(); env.worker_name = "/job:mnist/replica:1/task:2"; + env.worker_cache = &cache; RpcRendezvousMgr rmgr(&env); const int64 step_id = 123; const Rendezvous::ParsedKey key = MakeKey(Rendezvous::CreateKey( @@ -71,9 +89,11 @@ TEST(RpcRendezvousMgrTest, LocalSendRecv) { } TEST(RpcRendezvousMgrTest, LocalAbort) { + DummyWorkerCache cache; WorkerEnv env; env.env = Env::Default(); env.worker_name = "/job:mnist/replica:1/task:2"; + env.worker_cache = &cache; RpcRendezvousMgr rmgr(&env); const Rendezvous::ParsedKey key = MakeKey(Rendezvous::CreateKey( "/job:mnist/replica:1/task:2/cpu:0", 7890, @@ -107,9 +127,11 @@ TEST(RpcRendezvousMgrTest, LocalAbort) { } TEST(RpcRendezvousMgrTest, CleanupAll) { + DummyWorkerCache cache; WorkerEnv env; env.env = Env::Default(); env.worker_name = "/job:mnist/replica:1/task:2"; + env.worker_cache = &cache; RpcRendezvousMgr rmgr(&env); const Rendezvous::ParsedKey key = 
MakeKey(Rendezvous::CreateKey( "/job:mnist/replica:1/task:2/cpu:0", 7890, @@ -140,9 +162,11 @@ class DummyDeviceContext : public DeviceContext { TEST(RpcRendezvousMgrTest, TransferDummyDeviceContext) { DummyDeviceContext* dc = new DummyDeviceContext(123); + DummyWorkerCache cache; WorkerEnv env; env.env = Env::Default(); env.worker_name = "/job:mnist/replica:1/task:2"; + env.worker_cache = &cache; RpcRendezvousMgr rmgr(&env); const int64 step_id = 123; const Rendezvous::ParsedKey key = MakeKey(Rendezvous::CreateKey( diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc new file mode 100644 index 00000000000..72399c9b11f --- /dev/null +++ b/tensorflow/core/distributed_runtime/tensor_coding.cc @@ -0,0 +1,221 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/distributed_runtime/tensor_coding.h" + +namespace tensorflow { + +TensorResponse::TensorResponse(Allocator* allocator) : allocator_(allocator) {} + +Status TensorResponse::ParseFrom(Source* source) { + if (already_used_) { + Clear(); + } + already_used_ = true; + if (ParseFast(source)) return Status::OK(); + meta_.Clear(); + if (ParseSlow(source)) return Status::OK(); + return errors::InvalidArgument("Cannot parse tensor from response"); +} + +// Define some helper routines for decoding protocol buffer wire format data +namespace { +// We only need some of the wiretype values for this code +enum WireType { + WIRETYPE_VARINT = 0, + WIRETYPE_LENGTH_DELIMITED = 2, +}; +inline int GetTagFieldNumber(uint32 tag) { return tag >> 3; } +inline WireType GetTagWireType(uint32 tag) { + return static_cast(tag & 0x7); +} + +bool ReadVarintSizeAsInt(protobuf::io::CodedInputStream* input, int* result) { + uint64 v; + if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { + *result = static_cast(v); + return true; + } else { + return false; + } +} + +bool ReadNestedMessage(protobuf::io::CodedInputStream* input, + protobuf::Message* value) { + int length; + if (!ReadVarintSizeAsInt(input, &length)) return false; + std::pair p = + input->IncrementRecursionDepthAndPushLimit(length); + if (p.second < 0 || !value->MergePartialFromCodedStream(input)) return false; + // Make sure that parsing stopped when the limit was hit, not at an endgroup + // tag. 
+ return input->DecrementRecursionDepthAndPopLimit(p.first); +} + +} // namespace + +bool TensorResponse::ParseTensorSubmessage( + protobuf::io::CodedInputStream* input, TensorProto* tensor_meta) { + bool seen_tensor_content = false; + while (true) { + auto p = input->ReadTagWithCutoff(127); + int tag = GetTagFieldNumber(p.first); + WireType wt = GetTagWireType(p.first); + if (!p.second) { + bool ok = (tag == 0); + if (ok && !seen_tensor_content) { + // No tensor content: could be because it's a zero-length tensor + TensorShape shape(tensor_meta->tensor_shape()); + Tensor t(allocator_, tensor_meta->dtype(), shape); + tensor_ = std::move(t); + } + return ok; + } + switch (tag) { + case TensorProto::kDtypeFieldNumber: { + uint32 v; + if ((wt != WIRETYPE_VARINT) || !input->ReadVarint32(&v)) return false; + if (seen_tensor_content) return false; + tensor_meta->set_dtype(static_cast(static_cast(v))); + if (!DataTypeCanUseMemcpy(tensor_meta->dtype())) return false; + break; + } + case TensorProto::kTensorShapeFieldNumber: { + if ((wt != WIRETYPE_LENGTH_DELIMITED) || + !ReadNestedMessage(input, tensor_meta->mutable_tensor_shape())) + return false; + if (seen_tensor_content) return false; + break; + } + case TensorProto::kVersionNumberFieldNumber: { + uint32 v; + if ((wt != WIRETYPE_VARINT) || !input->ReadVarint32(&v)) return false; + if (seen_tensor_content) return false; + tensor_meta->set_version_number(static_cast(v)); + break; + } + case TensorProto::kTensorContentFieldNumber: { + // If we haven't seen the dtype and tensor_shape data first, we can't + // deal with this in the fast path. 
+ if (seen_tensor_content) return false; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !tensor_meta->has_tensor_shape()) { + return false; + } + int num_bytes; + if (!ReadVarintSizeAsInt(input, &num_bytes)) return false; + seen_tensor_content = true; + TensorShape shape(tensor_meta->tensor_shape()); + Tensor t(allocator_, tensor_meta->dtype(), shape); + StringPiece buf = t.tensor_data(); + if (num_bytes != buf.size()) return false; + // TODO(jeff,sanjay): Figure out a way to avoid this copy if + // the underlying ZeroCopyInputStream data is properly aligned + // and compatible with what allocator_ wants. + if (!input->ReadRaw(const_cast(buf.data()), num_bytes)) + return false; + tensor_ = std::move(t); + break; + } + default: { + // Some other tag our fast path code is not prepared to handle. + // return false. + return false; + } + } + } +} + +bool TensorResponse::ParseFast(Source* source) { + protobuf::io::CodedInputStream input(source->contents()); + while (true) { + auto p = input.ReadTagWithCutoff(127); + int tag = GetTagFieldNumber(p.first); + WireType wt = GetTagWireType(p.first); + if (!p.second) { + return (tag == 0); + } + switch (tag) { + case RecvTensorResponse::kTensorFieldNumber: { + if (wt != WIRETYPE_LENGTH_DELIMITED) return false; + + int length; + if (!ReadVarintSizeAsInt(&input, &length)) return false; + std::pair p = + input.IncrementRecursionDepthAndPushLimit(length); + if (p.second < 0 || + !ParseTensorSubmessage(&input, meta_.mutable_tensor())) { + return false; + } + if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { + return false; + } + break; + } + case RecvTensorResponse::kIsDeadFieldNumber: { + uint32 v; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) return false; + meta_.set_is_dead((v != 0) ? 
true : false); + break; + } + case RecvTensorResponse::kSendStartMicrosFieldNumber: { + uint64 v; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) return false; + meta_.set_send_start_micros(static_cast(v)); + break; + } + case RecvTensorResponse::kTransportOptionsFieldNumber: { + if ((wt != WIRETYPE_LENGTH_DELIMITED) || + !ReadNestedMessage(&input, meta_.mutable_transport_options())) + return false; + break; + } + default: { + // Unknown tag, so we can't handle it on the fast path. + return false; + } + } + } + + return false; +} + +bool TensorResponse::ParseSlow(Source* source) { + if (!meta_.ParseFromZeroCopyStream(source->contents())) { + return false; + } + + Tensor parsed(meta_.tensor().dtype()); + if (!parsed.FromProto(allocator_, meta_.tensor())) { + return false; + } + tensor_ = std::move(parsed); + + // Reduce memory usage for big tensors. + { + TensorProto empty; + meta_.mutable_tensor()->Swap(&empty); + } + meta_.clear_tensor(); + + return true; +} + +void TensorResponse::Clear() { + meta_.Clear(); + tensor_ = Tensor(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/tensor_coding.h b/tensorflow/core/distributed_runtime/tensor_coding.h new file mode 100644 index 00000000000..e193b0776d1 --- /dev/null +++ b/tensorflow/core/distributed_runtime/tensor_coding.h @@ -0,0 +1,85 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_TENSOR_CODING_H_ +#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_TENSOR_CODING_H_ + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/worker.pb.h" + +namespace tensorflow { + +class Allocator; +class TensorProto; + +// TensorResponse can be used as the destination of an RPC that returns +// a RecvTensorResponse. It efficiently decodes the incoming data +// into Tensor contents as well as associated metadata. +class TensorResponse { + public: + explicit TensorResponse(Allocator* allocator); + + // Source provides a way for a particular RPC implementation to provide + // received data to ParseFrom. + class Source { + public: + // Return the stream that contains the data to be parsed. + // Note that this method might be invoked more than once if + // ParseFrom needs to fall back to a more expensive parsing method. + // Every call must return a stream pointing at the beginning of + // the serialized RecvTensorResponse. + // + // Note that a subsequent call to contents() invalidates previous + // results of contents(). + // + // Ownership of the returned stream is retained by the Source and + // should not be deleted by the caller. + virtual ::tensorflow::protobuf::io::ZeroCopyInputStream* contents() = 0; + }; + + // Parse the RecvTensorResponse encoded in the data yielded by + // source->contents() into *this. + Status ParseFrom(Source* source); + + // Return a reference to the parsed tensor. The tensor will remain + // live only until *this is destroyed or modified. + const Tensor& tensor() const { return tensor_; } + + // Return a reference to the parsed tensor metadata (no contents). + // The result will remain live only until *this is destroyed or + // modified. 
+ const RecvTensorResponse& metadata() const { return meta_; } + + // Clear contents of *this. + void Clear(); + + private: + bool ParseTensorSubmessage(protobuf::io::CodedInputStream* input, + TensorProto* tensor_meta); + bool ParseFast(Source* source); + bool ParseSlow(Source* source); + + Allocator* allocator_ = nullptr; + bool already_used_ = false; + Tensor tensor_; + RecvTensorResponse meta_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_TENSOR_CODING_H_ diff --git a/tensorflow/core/distributed_runtime/tensor_coding_test.cc b/tensorflow/core/distributed_runtime/tensor_coding_test.cc new file mode 100644 index 00000000000..0b1d3b61896 --- /dev/null +++ b/tensorflow/core/distributed_runtime/tensor_coding_test.cc @@ -0,0 +1,186 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/distributed_runtime/tensor_coding.h" + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/protobuf/worker.pb.h" + +namespace tensorflow { + +class StringSource : public TensorResponse::Source { + public: + explicit StringSource(const string* s, int block_size) + : s_(s), stream_(nullptr), block_size_(block_size) {} + virtual ~StringSource() { DeleteStream(); } + + protobuf::io::ZeroCopyInputStream* contents() { + DeleteStream(); + stream_ = new (&space_) + protobuf::io::ArrayInputStream(s_->data(), s_->size(), block_size_); + return stream_; + } + + void DeleteStream() { + if (stream_) { + stream_->~ArrayInputStream(); + } + } + + private: + const string* s_; + protobuf::io::ArrayInputStream* stream_; + char space_[sizeof(protobuf::io::ArrayInputStream)]; + int block_size_; +}; + +class TensorResponseTest : public ::testing::Test { + public: + void Validate(const Tensor& src, bool is_dead, bool use_tensor_content) { + RecvTensorResponse proto; + proto.set_is_dead(is_dead); + proto.set_send_start_micros(123456); + if (use_tensor_content) { + src.AsProtoTensorContent(proto.mutable_tensor()); + } else { + src.AsProtoField(proto.mutable_tensor()); + } + string encoded; + proto.AppendToString(&encoded); + + StringSource source(&encoded, 1024); + + TensorResponse response(cpu_allocator()); + for (int i = 0; i < 2; i++) { // Twice so we exercise reuse of "response" + Status s = response.ParseFrom(&source); + EXPECT_TRUE(s.ok()); + + const RecvTensorResponse& meta = response.metadata(); + EXPECT_EQ(meta.is_dead(), is_dead); + 
EXPECT_EQ(meta.send_start_micros(), 123456); + + const Tensor& result = response.tensor(); + EXPECT_EQ(result.dtype(), src.dtype()); + EXPECT_EQ(result.shape().DebugString(), src.shape().DebugString()); + EXPECT_EQ(result.DebugString(), src.DebugString()); + } + } + + template + void DoTest(DataType dt) { + gtl::InlinedVector v; + LOG(ERROR) << "DT: " << static_cast(dt); + for (int elems = 0; elems <= 10000; elems++) { + if (elems < 100 || (elems % 1000 == 0)) { + Tensor a(dt, TensorShape({1, static_cast(v.size())})); + test::FillValues(&a, v); + Validate(a, (elems == 0), true); + } + v.push_back(static_cast(elems)); + } + } + void DoTestForStrings(DataType dt) { + gtl::InlinedVector v; + LOG(ERROR) << "DT: string"; + for (int elems = 0; elems <= 10000; elems++) { + if (elems < 100 || (elems % 1000 == 0)) { + Tensor a(dt, TensorShape({1, static_cast(v.size())})); + test::FillValues(&a, v); + Validate(a, (elems == 0), true); + } + v.push_back(strings::StrCat("This is string ", elems)); + } + } +}; + +TEST_F(TensorResponseTest, Simple) { + DoTest(DT_FLOAT); + DoTest(DT_DOUBLE); + DoTest(DT_INT32); + DoTest(DT_UINT16); + DoTest(DT_UINT8); + DoTest(DT_INT16); + DoTest(DT_INT8); + DoTest(DT_COMPLEX64); + DoTest(DT_COMPLEX128); + DoTest(DT_INT64); + DoTest(DT_BOOL); + DoTest(DT_QINT8); + DoTest(DT_QUINT8); + DoTest(DT_QINT16); + DoTest(DT_QUINT16); + DoTest(DT_QINT32); + DoTest(DT_BFLOAT16); + DoTest(DT_HALF); +} + +TEST_F(TensorResponseTest, StringTensor) { DoTestForStrings(DT_STRING); } + +string MakeFloatTensorTestCase(int num_elems) { + std::vector v(num_elems); + for (int i = 0; i < num_elems; i++) { + v[i] = i % 10; + } + Tensor src(DT_INT8, TensorShape({1, static_cast(v.size())})); + test::FillValues(&src, v); + + RecvTensorResponse proto; + proto.set_is_dead(false); + proto.set_send_start_micros(123456); + src.AsProtoTensorContent(proto.mutable_tensor()); + string encoded; + proto.AppendToString(&encoded); + return encoded; +} + +static void BM_TensorResponse(int 
iters, int arg) { + testing::StopTiming(); + string encoded = MakeFloatTensorTestCase(arg); + testing::StartTiming(); + while (--iters > 0) { + TensorResponse response(cpu_allocator()); + StringSource source(&encoded, -1); + Status s = response.ParseFrom(&source); + if (iters == 1) { + testing::SetLabel( + strings::StrCat("Bytes: ", response.tensor().TotalBytes())); + } + } +} +BENCHMARK(BM_TensorResponse)->Arg(0)->Arg(1000)->Arg(100000); + +static void BM_TensorViaTensorProto(int iters, int arg) { + testing::StopTiming(); + string encoded = MakeFloatTensorTestCase(arg); + testing::StartTiming(); + while (--iters > 0) { + RecvTensorResponse r; + r.ParseFromString(encoded); + Tensor t; + CHECK(t.FromProto(r.tensor())); + if (iters == 1) { + testing::SetLabel(strings::StrCat("Bytes: ", t.TotalBytes())); + } + } +} +BENCHMARK(BM_TensorViaTensorProto)->Arg(0)->Arg(1000)->Arg(100000); + +} // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/worker_cache.h b/tensorflow/core/distributed_runtime/worker_cache.h index 3efe14998fb..c46c0561364 100644 --- a/tensorflow/core/distributed_runtime/worker_cache.h +++ b/tensorflow/core/distributed_runtime/worker_cache.h @@ -19,7 +19,7 @@ limitations under the License. #include #include -#include "tensorflow/core/distributed_runtime/worker_interface.h" // for CallOptions +#include "tensorflow/core/distributed_runtime/worker_interface.h" #include "tensorflow/core/framework/device_attributes.pb.h" // for BusAdjacency #include "tensorflow/core/lib/core/status.h" @@ -28,7 +28,6 @@ typedef std::function StatusCallback; class ChannelCache; class StepStats; -class WorkerInterface; class WorkerCacheInterface { public: @@ -46,6 +45,17 @@ class WorkerCacheInterface { // ownership, not a cache lookup. virtual WorkerInterface* CreateWorker(const string& target) = 0; + // Release a worker previously returned by this->CreateWorker(target). + // + // TODO(jeff,sanjay): Consider moving target into WorkerInterface. 
+ // TODO(jeff,sanjay): Consider disallowing direct deletion of WorkerInterface. + // TODO(jeff,sanjay): Unify all worker-cache impls and factor out a + // per-rpc-subsystem WorkerInterface creator. + virtual void ReleaseWorker(const string& target, WorkerInterface* worker) { + // Subclasses may override to reuse worker objects. + delete worker; + } + // Set *ba with the BusAdjacency of the specified remote device // within its local environment. Returns true if the device bus // affinity was set, using only locally cached data. Returns false diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc index 4676db6ab60..5dfaa160d0c 100644 --- a/tensorflow/core/framework/common_shape_fns.cc +++ b/tensorflow/core/framework/common_shape_fns.cc @@ -173,17 +173,6 @@ Status BiasAddGradShape(shape_inference::InferenceContext* c) { return Status::OK(); } -namespace { -Status CheckKnownDim(shape_inference::InferenceContext* c, const Dimension* dim, - const char* name) { - if (!c->ValueKnown(dim)) { - return errors::InvalidArgument("Cannot infer shape because dimension ", - name, " is not known."); - } - return Status::OK(); -} -} // namespace - Status Conv2DShape(shape_inference::InferenceContext* c) { const Shape* input_shape; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape)); @@ -224,10 +213,10 @@ Status Conv2DShape(shape_inference::InferenceContext* c) { const Dimension* output_depth_dim = c->Dim(filter_shape, 3); // At the moment we need to know the values of several fields. 
- TF_RETURN_IF_ERROR(CheckKnownDim(c, in_rows_dim, "in_rows")); - TF_RETURN_IF_ERROR(CheckKnownDim(c, in_cols_dim, "in_cols")); - TF_RETURN_IF_ERROR(CheckKnownDim(c, filter_rows_dim, "filter_rows")); - TF_RETURN_IF_ERROR(CheckKnownDim(c, filter_cols_dim, "filter_cols")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_rows_dim, "filter_rows")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_cols_dim, "filter_cols")); auto in_rows = c->Value(in_rows_dim); auto in_cols = c->Value(in_cols_dim); @@ -263,6 +252,75 @@ Status Conv2DShape(shape_inference::InferenceContext* c) { return Status::OK(); } +Status Conv3DShape(shape_inference::InferenceContext* c) { + const Shape* input_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 5, &input_shape)); + const Shape* filter_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 5, &filter_shape)); + + std::vector strides; + TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); + if (strides.size() != 5) { + return errors::InvalidArgument( + "Conv3D requires the stride attribute to contain 5 values, but got: ", + strides.size()); + } + + int32 stride_planes = strides[1]; + int32 stride_rows = strides[2]; + int32 stride_cols = strides[3]; + + const Dimension* batch_size_dim = c->Dim(input_shape, 0); + const Dimension* in_planes_dim = c->Dim(input_shape, 1); + const Dimension* in_rows_dim = c->Dim(input_shape, 2); + const Dimension* in_cols_dim = c->Dim(input_shape, 3); + + const Dimension* filter_planes_dim = c->Dim(filter_shape, 0); + const Dimension* filter_rows_dim = c->Dim(filter_shape, 1); + const Dimension* filter_cols_dim = c->Dim(filter_shape, 2); + const Dimension* output_depth_dim = c->Dim(filter_shape, 4); + + // At the moment we need to know the values of several fields. 
+ TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_planes_dim, "in_planes")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_planes_dim, "filter_planes")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_rows_dim, "filter_rows")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_cols_dim, "filter_cols")); + + auto in_planes = c->Value(in_planes_dim); + auto in_rows = c->Value(in_rows_dim); + auto in_cols = c->Value(in_cols_dim); + auto filter_planes = c->Value(filter_planes_dim); + auto filter_rows = c->Value(filter_rows_dim); + auto filter_cols = c->Value(filter_cols_dim); + + const Dimension* unused; + TF_RETURN_IF_ERROR( + c->Merge(c->Dim(input_shape, 4), c->Dim(filter_shape, 3), &unused)); + + Padding padding; + TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding)); + + int64 output_planes, output_rows, output_cols; + int64 padding_before, padding_after; + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_planes, filter_planes, stride_planes, padding, &output_planes, + &padding_before, &padding_after)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_rows, filter_rows, stride_rows, padding, &output_rows, &padding_before, + &padding_after)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_cols, filter_cols, stride_cols, padding, &output_cols, &padding_before, + &padding_after)); + + const Shape* output_shape = + c->MakeShape({batch_size_dim, output_planes, output_rows, output_cols, + output_depth_dim}); + c->set_output(0, output_shape); + return Status::OK(); +} + Status DepthwiseConv2DNativeShape(shape_inference::InferenceContext* c) { const Shape* input_shape; TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape)); @@ -288,12 +346,12 @@ Status DepthwiseConv2DNativeShape(shape_inference::InferenceContext* c) { const Dimension* depth_multiplier = c->Dim(filter_shape, 3); // At the moment we need to know the 
values of several fields. - TF_RETURN_IF_ERROR(CheckKnownDim(c, in_rows_dim, "in_rows")); - TF_RETURN_IF_ERROR(CheckKnownDim(c, in_cols_dim, "in_cols")); - TF_RETURN_IF_ERROR(CheckKnownDim(c, filter_rows_dim, "filter_rows")); - TF_RETURN_IF_ERROR(CheckKnownDim(c, filter_cols_dim, "filter_cols")); - TF_RETURN_IF_ERROR(CheckKnownDim(c, input_depth, "depth")); - TF_RETURN_IF_ERROR(CheckKnownDim(c, depth_multiplier, "depth_multiplier")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_rows_dim, "filter_rows")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_cols_dim, "filter_cols")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(input_depth, "depth")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(depth_multiplier, "depth_multiplier")); // Check that the input depths are compatible. TF_RETURN_IF_ERROR( @@ -380,8 +438,8 @@ Status AvgPoolShape(shape_inference::InferenceContext* c) { const Dimension* output_depth_dim = c->Dim(input_shape, 3); // At the moment we need to know the values of several fields. 
- TF_RETURN_IF_ERROR(CheckKnownDim(c, in_rows_dim, "in_rows")); - TF_RETURN_IF_ERROR(CheckKnownDim(c, in_cols_dim, "in_cols")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols")); Padding padding; TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding)); @@ -414,5 +472,177 @@ Status AvgPoolShape(shape_inference::InferenceContext* c) { return Status::OK(); } +Status MaxPoolShape(shape_inference::InferenceContext* c) { + const Shape* input_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape)); + + string data_format; + Status s = c->GetAttr("data_format", &data_format); + + std::vector strides; + TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); + if (strides.size() != 4) { + return errors::InvalidArgument( + "AvgPool requires the stride attribute to contain 4 values, but " + "got: ", + strides.size()); + } + + std::vector kernel_sizes; + TF_RETURN_IF_ERROR(c->GetAttr("ksize", &kernel_sizes)); + if (kernel_sizes.size() != 4) { + return errors::InvalidArgument( + "AvgPool requires the ksize attribute to contain 4 values, but got: ", + kernel_sizes.size()); + } + + int32 stride_rows, stride_cols, stride_depth; + int32 kernel_rows, kernel_cols, kernel_depth; + + if (s.ok() && data_format == "NCHW") { + // Convert input shape to default NHWC for inference + input_shape = + c->MakeShape({{c->Dim(input_shape, 0), c->Dim(input_shape, 2), + c->Dim(input_shape, 3), c->Dim(input_shape, 1)}}); + stride_depth = strides[1]; + stride_rows = strides[2]; + stride_cols = strides[3]; + kernel_depth = kernel_sizes[1]; + kernel_rows = kernel_sizes[2]; + kernel_cols = kernel_sizes[3]; + } else { + stride_rows = strides[1]; + stride_cols = strides[2]; + stride_depth = strides[3]; + kernel_rows = kernel_sizes[1]; + kernel_cols = kernel_sizes[2]; + kernel_depth = kernel_sizes[3]; + } + + const Dimension* batch_size_dim = c->Dim(input_shape, 0); + const Dimension* in_rows_dim = 
c->Dim(input_shape, 1); + const Dimension* in_cols_dim = c->Dim(input_shape, 2); + const Dimension* in_depth_dim = c->Dim(input_shape, 3); + + // At the moment we need to know the values of several fields. + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_depth_dim, "in_depth")); + + Padding padding; + TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding)); + + // TODO(mrry,shlens): Raise an error if the stride would cause + // information in the input to be ignored. This will require a change + // in the kernel implementation. + auto in_rows = c->Value(in_rows_dim); + auto in_cols = c->Value(in_cols_dim); + auto in_depth = c->Value(in_depth_dim); + + int64 output_rows, output_cols, output_depth; + int64 padding_before, padding_after; + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_rows, kernel_rows, stride_rows, padding, &output_rows, &padding_before, + &padding_after)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_cols, kernel_cols, stride_cols, padding, &output_cols, &padding_before, + &padding_after)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_depth, kernel_depth, stride_depth, padding, &output_depth, + &padding_before, &padding_after)); + + const Shape* output_shape = + c->MakeShape({batch_size_dim, output_rows, output_cols, output_depth}); + + if (data_format == "NCHW") { + // Convert output shape back to expected NCHW data format. 
+ output_shape = + c->MakeShape({c->Dim(output_shape, 0), c->Dim(output_shape, 3), + c->Dim(output_shape, 1), c->Dim(output_shape, 2)}); + } + + c->set_output(0, output_shape); + return Status::OK(); +} + +Status Pool3DShape(shape_inference::InferenceContext* c) { + const Shape* input_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 5, &input_shape)); + + std::vector strides; + TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); + if (strides.size() != 5) { + return errors::InvalidArgument( + "Pool3D ops require the stride attribute to contain 5 values, but " + "got: ", + strides.size()); + } + + std::vector kernel_sizes; + TF_RETURN_IF_ERROR(c->GetAttr("ksize", &kernel_sizes)); + if (kernel_sizes.size() != 5) { + return errors::InvalidArgument( + "Pool3D requires the ksize attribute to contain 5 values, but got: ", + kernel_sizes.size()); + } + + int32 stride_planes, stride_rows, stride_cols; + int32 kernel_planes, kernel_rows, kernel_cols; + + stride_planes = strides[1]; + stride_rows = strides[2]; + stride_cols = strides[3]; + kernel_planes = kernel_sizes[1]; + kernel_rows = kernel_sizes[2]; + kernel_cols = kernel_sizes[3]; + + const Dimension* batch_size_dim = c->Dim(input_shape, 0); + const Dimension* in_planes_dim = c->Dim(input_shape, 1); + const Dimension* in_rows_dim = c->Dim(input_shape, 2); + const Dimension* in_cols_dim = c->Dim(input_shape, 3); + const Dimension* output_depth_dim = c->Dim(input_shape, 4); + + // At the moment we need to know the values of several fields. + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_planes_dim, "in_planes")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols")); + + Padding padding; + TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding)); + + // TODO(mrry,shlens): Raise an error if the stride would cause + // information in the input to be ignored. This will require a change + // in the kernel implementation. 
+ auto in_planes = c->Value(in_planes_dim); + auto in_rows = c->Value(in_rows_dim); + auto in_cols = c->Value(in_cols_dim); + + int64 output_planes, output_rows, output_cols; + int64 padding_before, padding_after; + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_planes, kernel_planes, stride_planes, padding, &output_planes, + &padding_before, &padding_after)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_rows, kernel_rows, stride_rows, padding, &output_rows, &padding_before, + &padding_after)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_cols, kernel_cols, stride_cols, padding, &output_cols, &padding_before, + &padding_after)); + + const Shape* output_shape = + c->MakeShape({batch_size_dim, output_planes, output_rows, output_cols, + output_depth_dim}); + + c->set_output(0, output_shape); + return Status::OK(); +} + +Status UnknownShape(shape_inference::InferenceContext* c) { + for (int i = 0; i < c->num_outputs(); ++i) { + c->set_output(i, c->UnknownShape()); + } + return Status::OK(); +} + } // namespace shape_inference } // namespace tensorflow diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h index a4c9946c6a3..0ca64990365 100644 --- a/tensorflow/core/framework/common_shape_fns.h +++ b/tensorflow/core/framework/common_shape_fns.h @@ -157,12 +157,24 @@ Status BiasAddGradShape(shape_inference::InferenceContext* c); // Shape function for Conv2D-like operations. Status Conv2DShape(shape_inference::InferenceContext* c); +// Shape function for Conv3D-like operations. +Status Conv3DShape(shape_inference::InferenceContext* c); + // Shape function for DepthwiseConv2D-like operations. Status DepthwiseConv2DNativeShape(shape_inference::InferenceContext* c); // Shape function for AvgPool-like operations. Status AvgPoolShape(shape_inference::InferenceContext* c); +// Shape function for MaxPool-like operations. 
+Status MaxPoolShape(shape_inference::InferenceContext* c); + +// Shape function for 3D Pooling operations. +Status Pool3DShape(shape_inference::InferenceContext* c); + +// Shape function for use with ops whose output shapes are unknown. +Status UnknownShape(shape_inference::InferenceContext* c); + } // namespace shape_inference } // namespace tensorflow diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc index 6da9a77cb7d..6e0dd7f742d 100644 --- a/tensorflow/core/framework/common_shape_fns_test.cc +++ b/tensorflow/core/framework/common_shape_fns_test.cc @@ -419,6 +419,55 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) { INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,4,4,d1_3]"); } +TEST(CommonShapeFnsTest, Conv3DShapeTest) { + ShapeInferenceTestOp op("Conv3D"); + auto set_op = [&op](const std::vector& strides, + const string& padding) { + TF_CHECK_OK(NodeDefBuilder("test", "Conv3D") + .Input("input", 0, DT_FLOAT) + .Input("filter", 0, DT_FLOAT) + .Attr("strides", strides) + .Attr("padding", padding) + .Finalize(&op.node_def)); + }; + + // 1x1x1 filter + set_op({{1, 1, 1, 1, 1}}, "VALID"); + INFER_OK(op, "[1,2,2,2,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]"); + + // Invalid rank for input + INFER_ERROR("must be rank 5", op, "[4,4];[2,1,1,1]"); + // Invalid rank for filter + INFER_ERROR("must be rank 5", op, "[1,4,4,1];[2,1,1]"); + + // No unknown dims in the critical fields. + INFER_ERROR("is not known", op, "[1,?,2,2,1];[1,1,1,1,1]"); + INFER_ERROR("is not known", op, "[1,2,?,2,1];[1,1,1,1,1]"); + INFER_ERROR("is not known", op, "[1,2,2,?,1];[1,1,1,1,1]"); + INFER_ERROR("is not known", op, "[1,2,2,2,1];[?,1,1,1,1]"); + INFER_ERROR("is not known", op, "[1,2,2,2,1];[1,?,1,1,1]"); + + // input depths must match. 
+ INFER_ERROR("Dimensions must be equal, but are 10 and 10000", op, + "[1,2,2,2,10];[1,1,1,10000,20]"); + + // 2x2x2 filter + set_op({{1, 1, 1, 1, 1}}, "VALID"); + INFER_OK(op, "[1,2,2,2,1];[2,2,2,1,1]", "[d0_0,1,1,1,d1_4]"); + + // 3x3 input, 1x1 filter, 2x2 stride + set_op({{1, 2, 2, 2, 1}}, "VALID"); + INFER_OK(op, "[1,3,3,3,1];[1,1,1,1,1]", "[d0_0,2,2,2,d1_4]"); + + // 3x3 input, 1x1 filter, 2x1x1 stride + set_op({{1, 2, 1, 1, 1}}, "VALID"); + INFER_OK(op, "[1,3,3,3,1];[1,1,1,1,1]", "[d0_0,2,3,3,d1_4]"); + + // 4x4 input, 2x2 filter, 1x1 stride + set_op({{1, 1, 1, 1, 1}}, "SAME"); + INFER_OK(op, "[1,4,4,4,1];[2,2,2,1,1]", "[d0_0,4,4,4,d1_4]"); +} + TEST(CommonShapeFnsTest, DepthwiseConv2DShapeTest) { ShapeInferenceTestOp op("DepthwiseConv2dNative"); std::vector strides = {{1, 1, 1, 1}}; @@ -485,5 +534,74 @@ TEST(CommonShapeFnsTest, AvgPool2DShapeTest) { INFER_ERROR("must be rank 4", op, "[4,4]"); } +TEST(CommonShapeFnsTest, MaxPool2DShapeTest) { + ShapeInferenceTestOp op("MaxPool"); + auto set_op = [&op](const std::vector& strides, + const std::vector& ksizes, const string& padding, + const string& data_format) { + TF_CHECK_OK(NodeDefBuilder("test", "MaxPool") + .Input("input", 0, DT_FLOAT) + .Attr("strides", strides) + .Attr("ksize", ksizes) + .Attr("padding", padding) + .Attr("data_format", data_format) + .Finalize(&op.node_def)); + }; + + // Most of the functionality is tested by conv-like shapes, + // so we check the very-specific maxpooling features here, + // namely depthwise kernel and striding. 
+ + // all 1 strides, depth 2 filter + set_op({1, 1, 1, 1}, {1, 1, 1, 2}, "VALID", "NHWC"); + INFER_OK(op, "[1,2,2,2]", "[d0_0,2,2,1]"); + + // depth 3 stride, 1x1x1 filter, NCHW + set_op({1, 3, 1, 1}, {1, 1, 1, 1}, "VALID", "NCHW"); + INFER_OK(op, "[1,7,5,5]", "[d0_0,3,5,5]"); +} + +TEST(CommonShapeFnsTest, Pool3DShapeTest) { + ShapeInferenceTestOp op("MaxPool3D"); + auto set_op = [&op](const std::vector& strides, + const std::vector& ksizes, const string& padding) { + TF_CHECK_OK(NodeDefBuilder("test", "MaxPool3D") + .Input("input", 0, DT_FLOAT) + .Attr("strides", strides) + .Attr("ksize", ksizes) + .Attr("padding", padding) + .Finalize(&op.node_def)); + }; + + // Most of the functionality is tested by conv-like shapes, + // so we check that we handle the extra dimension properly. + + // 2x3x4 stride, 1x1x1 filter. + set_op({1, 2, 3, 4, 1}, {1, 1, 1, 1, 1}, "VALID"); + INFER_OK(op, "[1,24,24,24,1]", "[d0_0,12,8,6,d0_4]"); +} + +TEST(CommonShapeFnsTest, UnknownShapeTest) { + { + // Single output + ShapeInferenceTestOp op("QueueDequeue"); + TF_CHECK_OK(NodeDefBuilder("test", "QueueDequeue") + .Input("handle", 0, DT_STRING_REF) + .Attr("component_types", {DT_FLOAT}) + .Finalize(&op.node_def)); + INFER_OK(op, "[1]", "?"); + } + + { + // Multiple outputs + ShapeInferenceTestOp op("QueueDequeue"); + TF_CHECK_OK(NodeDefBuilder("test", "QueueDequeue") + .Input("handle", 0, DT_STRING_REF) + .Attr("component_types", {DT_FLOAT, DT_FLOAT, DT_STRING}) + .Finalize(&op.node_def)); + INFER_OK(op, "[1]", "?;?;?"); + } +} + } // namespace shape_inference } // namespace tensorflow diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h index 03d4bde37b0..52afde9fac3 100644 --- a/tensorflow/core/framework/function.h +++ b/tensorflow/core/framework/function.h @@ -360,6 +360,10 @@ class FunctionLibraryRuntime { // Return the device on which the function executes. 
virtual Device* device() = 0; + + // Returns the function library definition that backs this runtime. + virtual const FunctionLibraryDefinition* GetFunctionLibraryDefinition() + const = 0; }; // To register a gradient function for a builtin op, one should use diff --git a/tensorflow/core/framework/memory_types.cc b/tensorflow/core/framework/memory_types.cc index d5382284946..6e37a3aba47 100644 --- a/tensorflow/core/framework/memory_types.cc +++ b/tensorflow/core/framework/memory_types.cc @@ -40,10 +40,6 @@ int GetTotal(const NameRangeMap& name_map) { void MemoryTypesHelper(const NameRangeMap& name_map, std::vector* host_memory_args, MemoryTypeVector* memory_types) { - // Now that we know the size, fill with the default 'DEVICE_MEMORY'. - memory_types->clear(); - memory_types->resize(GetTotal(name_map), DEVICE_MEMORY); - // Update args that have been marked as in "HOST_MEMORY". size_t keep = 0; for (size_t i = 0; i < host_memory_args->size(); ++i) { @@ -65,15 +61,27 @@ MemoryType MTypeFromDType(const DataType dtype) { return (dtype == DT_INT32) ? HOST_MEMORY : DEVICE_MEMORY; } -// Returns true if an arg of op_def's input/output is a type list. -bool HasTypeList(const OpDef& op_def) { - for (const auto& a : op_def.input_arg()) { - if (!a.type_list_attr().empty()) return true; +// Initialize the default memory types for type list arguments from the data +// types. (The default can be overridden by an explicit HostMemory() +// declaration.) 
+Status SetTypeListMTypesFromDTypes( + const NameRangeMap& name_ranges, + const protobuf::RepeatedPtrField& args, + const DataTypeVector& dtypes, MemoryTypeVector* mtypes) { + for (const auto& a : args) { + if (!a.type_list_attr().empty()) { + auto it = name_ranges.find(a.name()); + if (it == name_ranges.end()) { + return errors::InvalidArgument("Name range for argument ", a.name(), + " not found."); + } + + for (int i = it->second.first; i < it->second.second; ++i) { + (*mtypes)[i] = MTypeFromDType(dtypes[i]); + } + } } - for (const auto& a : op_def.output_arg()) { - if (!a.type_list_attr().empty()) return true; - } - return false; + return Status::OK(); } } // namespace @@ -91,20 +99,21 @@ Status MemoryTypesForNode(const OpRegistryInterface* op_registry, Status status = FindKernelDef(device_type, ndef, &kdef, nullptr /* kernel_class_name */); - if (!status.ok() || HasTypeList(*op_def)) { - // When there is no kernel def for this op or the op's arg is a - // type list, we can only best-effort derive the memory type from - // the data type. For now, we assume int32 is always on host - // memory and other types are always on device memory. We should + DataTypeVector inp_dtypes; + DataTypeVector out_dtypes; + TF_RETURN_IF_ERROR( + InOutTypesForNode(ndef, *op_def, &inp_dtypes, &out_dtypes)); + + inp_mtypes->clear(); + out_mtypes->clear(); + + if (!status.ok()) { + // When there is no kernel def for this op, we can only best-effort derive + // the memory type from the data type. For now, we assume int32 is always + // on host memory and other types are always on device memory. We should // do type inference over function body to derive the correct // input/output memory types. 
- DataTypeVector inp_dtypes; - DataTypeVector out_dtypes; - TF_RETURN_IF_ERROR( - InOutTypesForNode(ndef, *op_def, &inp_dtypes, &out_dtypes)); - inp_mtypes->clear(); for (const auto& t : inp_dtypes) inp_mtypes->push_back(MTypeFromDType(t)); - out_mtypes->clear(); for (const auto& t : out_dtypes) out_mtypes->push_back(MTypeFromDType(t)); return Status::OK(); } @@ -114,6 +123,16 @@ Status MemoryTypesForNode(const OpRegistryInterface* op_registry, NameRangeMap out_names; TF_RETURN_IF_ERROR(NameRangesForNode(ndef, *op_def, &inp_names, &out_names)); + // Now that we know the size, fill with the default 'DEVICE_MEMORY'. + inp_mtypes->resize(GetTotal(inp_names), DEVICE_MEMORY); + out_mtypes->resize(GetTotal(out_names), DEVICE_MEMORY); + + // For type list arguments, mark int32 arguments as host memory. + TF_RETURN_IF_ERROR(SetTypeListMTypesFromDTypes(inp_names, op_def->input_arg(), + inp_dtypes, inp_mtypes)); + TF_RETURN_IF_ERROR(SetTypeListMTypesFromDTypes( + out_names, op_def->output_arg(), out_dtypes, out_mtypes)); + // Fills in host memory types based on the kernel def. 
const auto& from_proto = kdef->host_memory_arg(); std::vector host_memory_args(from_proto.begin(), from_proto.end()); @@ -124,6 +143,7 @@ Status MemoryTypesForNode(const OpRegistryInterface* op_registry, "HostMemory args '", str_util::Join(host_memory_args, "', '"), "' not found in OpDef: ", SummarizeOpDef(*op_def)); } + return Status::OK(); } diff --git a/tensorflow/core/framework/memory_types_test.cc b/tensorflow/core/framework/memory_types_test.cc index c4d5886ffb3..c4cd875bc4b 100644 --- a/tensorflow/core/framework/memory_types_test.cc +++ b/tensorflow/core/framework/memory_types_test.cc @@ -35,14 +35,18 @@ REGISTER_OP("HostMemoryTest") .Input("a: float") .Input("b: T") .Input("c: N * string") + .Input("d: Tlist") .Output("o: N * T") + .Output("p: Tlist") .Attr("T: type") - .Attr("N: int"); + .Attr("N: int") + .Attr("Tlist: list(type)"); REGISTER_KERNEL_BUILDER(Name("HostMemoryTest").Device(DEVICE_CPU), DummyKernel); REGISTER_KERNEL_BUILDER(Name("HostMemoryTest") .Device(DEVICE_GPU) .HostMemory("a") .HostMemory("c") + .HostMemory("d") .HostMemory("o"), DummyKernel); @@ -52,20 +56,29 @@ TEST(MemoryTypesForNode, Simple) { .Input(FakeInput()) .Input(FakeInput(DT_BOOL)) .Input(FakeInput(3)) + .Input(FakeInput({DT_INT32, DT_FLOAT, DT_INT32})) .Finalize(&node_def)); MemoryTypeVector input, output; TF_EXPECT_OK(MemoryTypesForNode(OpRegistry::Global(), DEVICE_CPU, node_def, &input, &output)); - EXPECT_EQ(MemoryTypeVector(5, DEVICE_MEMORY), input); - EXPECT_EQ(MemoryTypeVector(3, DEVICE_MEMORY), output); + EXPECT_EQ(MemoryTypeVector({DEVICE_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY, + DEVICE_MEMORY, DEVICE_MEMORY, HOST_MEMORY, + DEVICE_MEMORY, HOST_MEMORY}), + input); + EXPECT_EQ(MemoryTypeVector({DEVICE_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY, + HOST_MEMORY, DEVICE_MEMORY, HOST_MEMORY}), + output); TF_EXPECT_OK(MemoryTypesForNode(OpRegistry::Global(), DEVICE_GPU, node_def, &input, &output)); - EXPECT_EQ(MemoryTypeVector({HOST_MEMORY, DEVICE_MEMORY, HOST_MEMORY, - HOST_MEMORY, 
HOST_MEMORY}), - input); - EXPECT_EQ(MemoryTypeVector(3, HOST_MEMORY), output); + EXPECT_EQ( + MemoryTypeVector({HOST_MEMORY, DEVICE_MEMORY, HOST_MEMORY, HOST_MEMORY, + HOST_MEMORY, HOST_MEMORY, HOST_MEMORY, HOST_MEMORY}), + input); + EXPECT_EQ(MemoryTypeVector({HOST_MEMORY, HOST_MEMORY, HOST_MEMORY, + HOST_MEMORY, DEVICE_MEMORY, HOST_MEMORY}), + output); } } // namespace tensorflow diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index cc2fcced913..3e21dd5cd38 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -93,6 +93,10 @@ OpKernel::OpKernel(OpKernelConstruction* context) &output_name_map_)); OP_REQUIRES_OK(context, CheckOpDeprecation(context->op_def(), context->graph_def_version())); + + // Kernels executing on GPU tie very few resources on the CPU where the + // scheduler runs: we consider them as inexpensive. + expensive_ = context->device_type() != DeviceType(DEVICE_GPU); } OpKernel::~OpKernel() {} diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h index 829a1eef4b6..c241e173b43 100644 --- a/tensorflow/core/framework/op_kernel.h +++ b/tensorflow/core/framework/op_kernel.h @@ -104,7 +104,7 @@ class OpKernel { // Returns true iff this op kernel is considered "expensive". The // runtime may use this flag to optimize graph execution for example // to "inline" inexpensive kernels. - virtual bool IsExpensive() { return true; } + virtual bool IsExpensive() { return expensive_; } // Accessors. 
const NodeDef& def() const { return def_; } @@ -160,6 +160,7 @@ class OpKernel { const bool is_internal_; // True if this is an internal operation NameRangeMap input_name_map_; NameRangeMap output_name_map_; + bool expensive_; TF_DISALLOW_COPY_AND_ASSIGN(OpKernel); }; @@ -179,6 +180,8 @@ class AsyncOpKernel : public OpKernel { AsyncOpKernel* AsAsync() final { return this; } void Compute(OpKernelContext* context) final; + + bool IsExpensive() override { return true; } }; // Wraps a tensor that is held by an Op across calls to Compute(). For diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc index dce44d8d4d8..9c90bfe0f50 100644 --- a/tensorflow/core/framework/shape_inference.cc +++ b/tensorflow/core/framework/shape_inference.cc @@ -99,6 +99,14 @@ InferenceContext::~InferenceContext() { for (auto* d : all_dims_) delete d; } +bool InferenceContext::FullyDefined(const Shape* s) { + if (!RankKnown(s)) return false; + for (int i = 0; i < Rank(s); ++i) { + if (!ValueKnown(Dim(s, i))) return false; + } + return true; +} + const Dimension* InferenceContext::NumElements(const Shape* s) { const auto rank = Rank(s); if (rank == kUnknownRank) return UnknownDim(); @@ -379,12 +387,6 @@ Status InferenceContext::ReplaceDim(const Shape* s, int dim_index_in, return ReturnCreatedShape(dims, out); } -const Dimension* InferenceContext::GetDimension(const DimensionOrConstant& d) { - if (d.dim != nullptr) return d.dim; - DCHECK(d.val >= 0 || d.val == kUnknownDim); - return MakeDim(d.val); -} - const Shape* InferenceContext::MakeShape( const std::vector& dims) { all_shapes_.push_back(new Shape(dims)); @@ -396,7 +398,7 @@ const Shape* InferenceContext::MakeShape( std::vector dims_actual; dims_actual.reserve(dims.size()); for (const DimensionOrConstant& d : dims) { - dims_actual.push_back(GetDimension(d)); + dims_actual.push_back(MakeDim(d)); } return MakeShape(dims_actual); } @@ -480,11 +482,6 @@ Status 
InferenceContext::MakeShapeFromShapeProto(const TensorShapeProto& proto, return ReturnCreatedShape(dims, out); } -const Dimension* InferenceContext::MakeDim(int64 value) { - all_dims_.push_back(new Dimension(value)); - return all_dims_.back(); -} - // Returns a new dimension whose value is given by a scalar input tensor. Status InferenceContext::MakeDimForScalarInput(int idx, const Dimension** out) { const Tensor* t = input_tensor(idx); @@ -492,6 +489,10 @@ Status InferenceContext::MakeDimForScalarInput(int idx, const Dimension** out) { *out = UnknownDim(); return Status::OK(); } + const int rank = t->dims(); + if (rank != 0) { + return errors::InvalidArgument("Input must be scalar but has rank ", rank); + } int64 val; if (t->dtype() == DT_INT32) { @@ -510,11 +511,6 @@ Status InferenceContext::MakeDimForScalarInput(int idx, const Dimension** out) { return Status::OK(); } -const Dimension* InferenceContext::UnknownDim() { - all_dims_.push_back(new Dimension()); - return all_dims_.back(); -} - Status InferenceContext::Divide(const Dimension* dividend, int64 divisor, const Dimension** out) { if (divisor == 1) { @@ -523,6 +519,10 @@ Status InferenceContext::Divide(const Dimension* dividend, int64 divisor, *out = UnknownDim(); } else { const int64 v = Value(dividend); + if (divisor <= 0) { + return errors::InvalidArgument("Divisor must be positive but is ", + divisor); + } if ((v % divisor) != 0) { return errors::InvalidArgument("Dimension size must be divisible by ", divisor, " but is ", v); @@ -534,87 +534,112 @@ Status InferenceContext::Divide(const Dimension* dividend, int64 divisor, Status InferenceContext::Add(const Dimension* first, DimensionOrConstant second, const Dimension** out) { - const int64 second_value = - second.dim == nullptr ? second.val : Value(second.dim); - if (second.dim != nullptr && !ValueKnown(second.dim)) { - *out = UnknownDim(); + const int64 first_value = Value(first); + const int64 second_value = Value(second); + // Special cases. 
+ if (first_value == 0) { + *out = MakeDim(second); } else if (second_value == 0) { - *out = first; - } else if (!ValueKnown(first)) { + *out = MakeDim(first); + } else if (first_value == kUnknownDim || second_value == kUnknownDim) { *out = UnknownDim(); } else { - const int64 v = Value(first); - const int64 sum = v + second_value; - if (second_value > 0 && sum < 0) { - return errors::InvalidArgument("Dimension size overflow from adding ", v, - " and ", second_value); - } else if (second_value < 0 && sum < 0) { - return errors::InvalidArgument("Negative dimension size from adding ", v, - " and ", second_value); + // Invariant: Both values are known and positive. + const int64 sum = first_value + second_value; + if (sum < 0) { + return errors::InvalidArgument("Dimension size overflow from adding ", + first_value, " and ", second_value); } *out = MakeDim(sum); } return Status::OK(); } +Status InferenceContext::Subtract(const Dimension* first, + DimensionOrConstant second, + const Dimension** out) { + const int64 first_value = Value(first); + const int64 second_value = Value(second); + // Special cases. + if (second_value == 0) { + *out = MakeDim(first); + } else if (first_value == kUnknownDim || second_value == kUnknownDim) { + *out = UnknownDim(); + } else { + // Invariant: Both values are known, first_value is non-negative, and + // second_value is positive. + if (first_value < second_value) { + return errors::InvalidArgument( + "Negative dimension size caused by subtracting ", second_value, + " from ", first_value); + } + *out = MakeDim(first_value - second_value); + } + return Status::OK(); +} + Status InferenceContext::Multiply(const Dimension* first, DimensionOrConstant second, const Dimension** out) { - int64 first_value = -1; - // Special cases for multiply are when the values are 0 or 1. 
- if (ValueKnown(first)) { - first_value = Value(first); - if (first_value == 0) { - *out = MakeDim(0); - return Status::OK(); - } - - // Output is whatever the second value is. - if (first_value == 1) { - *out = GetDimension(second); - return Status::OK(); - } - } - - // Same check for when the second argument is a known value. - // First find out if the value is known from DimOrConstant. - int64 second_value; - if (second.dim == nullptr) { - second_value = second.val; - } else { - if (!ValueKnown(second.dim)) { - // Second value is not known and first is not a special caase - *out = UnknownDim(); - return Status::OK(); - } - second_value = Value(second.dim); - } - - // Now that we know whether the value is known, apply the special - // casing. - if (second_value == 0) { - *out = MakeDim(0); - return Status::OK(); - } - - // Output is whatever the first value is. - if (second_value == 1) { + const int64 first_value = Value(first); + const int64 second_value = Value(second); + // Special cases. + if (first_value == 0) { *out = first; - return Status::OK(); - } - - if (!ValueKnown(first)) { - // First value is not known and second is not a special caase + } else if (second_value == 0) { + *out = MakeDim(second); + } else if (first_value == 1) { + *out = MakeDim(second); + } else if (second_value == 1) { + *out = first; + } else if (first_value == kUnknownDim || second_value == kUnknownDim) { *out = UnknownDim(); - return Status::OK(); + } else { + // Invariant: Both values are known and greater than 1.
+ const int64 product = first_value * second_value; + if (product < 0) { + return errors::InvalidArgument( + "Negative dimension size caused by overflow when multiplying ", + first_value, " and ", second_value); + } + *out = MakeDim(product); } + return Status::OK(); +} - const int64 product = first_value * second_value; - if (product < 0) { - return errors::InvalidArgument("Negative dimension size from multiplying ", - first_value, " and ", second_value); +Status InferenceContext::Min(const Dimension* first, DimensionOrConstant second, + const Dimension** out) { + const int64 first_value = Value(first); + const int64 second_value = Value(second); + if (first_value == 0) { + *out = first; + } else if (second_value == 0) { + *out = MakeDim(second); + } else if (first_value == kUnknownDim || second_value == kUnknownDim) { + *out = UnknownDim(); + } else { + if (first_value <= second_value) { + *out = first; + } else { + *out = MakeDim(second); + } + } + return Status::OK(); +} + +Status InferenceContext::Max(const Dimension* first, DimensionOrConstant second, + const Dimension** out) { + const int64 first_value = Value(first); + const int64 second_value = Value(second); + if (first_value == kUnknownDim || second_value == kUnknownDim) { + *out = UnknownDim(); + } else { + if (first_value >= second_value) { + *out = first; + } else { + *out = MakeDim(second); + } } - *out = MakeDim(product); return Status::OK(); } diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h index a7a5c50d02d..f35c8a4c815 100644 --- a/tensorflow/core/framework/shape_inference.h +++ b/tensorflow/core/framework/shape_inference.h @@ -46,7 +46,7 @@ class Dimension { class Shape { private: Shape(); - Shape(std::vector dims); + Shape(const std::vector& dims); ~Shape() {} const int32 rank_; @@ -61,13 +61,17 @@ class Shape { struct DimensionOrConstant { public: // Intentionally not explicit. 
- DimensionOrConstant(const Dimension* dim) : dim(dim) {} + DimensionOrConstant(const Dimension* dim); // val must be non-negative or InferenceContext::kUnknownDim. - DimensionOrConstant(int64 val) : val(val) {} + DimensionOrConstant(int64 val); - const Dimension* dim = nullptr; - int64 val = 0; + // dim takes precedence. If dim != nullptr, val is ignored. + const Dimension* dim; + int64 val; + + private: + DimensionOrConstant(); }; // Note: This is experimental support for op shape inference in C++. Shape @@ -81,8 +85,8 @@ struct DimensionOrConstant { // by the InferenceContext. class InferenceContext { public: - static constexpr int32 kUnknownRank = -1; static constexpr int64 kUnknownDim = -1; + static constexpr int32 kUnknownRank = -1; // This is a temporary constructor used for initial testing. // @@ -127,8 +131,15 @@ class InferenceContext { } int32 Rank(const Shape* s) { return s->rank_; } bool RankKnown(const Shape* s) { return Rank(s) != kUnknownRank; } - int64 Value(const Dimension* d) { return d->value_; } - bool ValueKnown(const Dimension* d) { return Value(d) != kUnknownDim; } + inline int64 Value(DimensionOrConstant d) { + return d.dim ? d.dim->value_ : d.val; + } + inline bool ValueKnown(DimensionOrConstant d) { + return Value(d) != kUnknownDim; + } + + // Returns true if the rank and all dimensions of the Shape are known. + bool FullyDefined(const Shape* s); // Returns the total number of elements, or an unknown dimension for an // incomplete shape. @@ -229,8 +240,15 @@ class InferenceContext { // Returns a new dimension of the given size. The returned value is owned by // this context. 
- const Dimension* MakeDim(int64 value); - const Dimension* UnknownDim(); + inline const Dimension* MakeDim(DimensionOrConstant d) { + if (d.dim) { + return d.dim; + } else { + all_dims_.push_back(new Dimension(d.val)); + return all_dims_.back(); + } + } + inline const Dimension* UnknownDim() { return MakeDim(kUnknownDim); } // Returns a new dimension whose value is given by a scalar input tensor. // The input tensor must be in host memory, since it is dereferenced to get @@ -244,7 +262,8 @@ class InferenceContext { Status GetAttr(StringPiece attr_name, T* value) const; // Returns in the result of dividing by . - // Returns an error if does not evenly divide . + // Returns an error if is not positive or does not evenly + // divide . Status Divide(const Dimension* dividend, int64 divisor, const Dimension** out); @@ -252,12 +271,37 @@ class InferenceContext { Status Add(const Dimension* first, DimensionOrConstant second, const Dimension** out); + // Returns in the dimension that is minus . + Status Subtract(const Dimension* first, DimensionOrConstant second, + const Dimension** out); + // Returns in the product of and . Status Multiply(const Dimension* first, DimensionOrConstant second, const Dimension** out); + // Returns in the minimum of and . If either or + // is zero the results is zero. Otherwise, if either or + // is unknown the results is unknown. + Status Min(const Dimension* first, DimensionOrConstant second, + const Dimension** out); + + // Returns in the maximum of and . If either or + // is unknown the results is unknown. + Status Max(const Dimension* first, DimensionOrConstant second, + const Dimension** out); + Status construction_status() const { return construction_status_; } + // Validates that 'dim' has a known value, and prints an error + // message containing 'name' if validation fails. 
+ Status ValidateKnownDim(const Dimension* dim, const char* name) { + if (!ValueKnown(dim)) { + return errors::InvalidArgument("Cannot infer shape because dimension ", + name, " is not known."); + } + return Status::OK(); + } + private: const Dimension* GetDimension(const DimensionOrConstant& d); @@ -294,12 +338,30 @@ class InferenceContext { // Template and inline method implementations, please ignore inline Dimension::Dimension() : value_(InferenceContext::kUnknownDim) {} -inline Dimension::Dimension(int64 value) : value_(value) {} +inline Dimension::Dimension(int64 value) : value_(value) { + DCHECK(value >= 0 || value == InferenceContext::kUnknownDim) + << "Dimension must be non-negative or equal to " + "InferenceContext::kUnknownDim but got " + << value; +} inline Shape::Shape() : rank_(InferenceContext::kUnknownRank) {} -inline Shape::Shape(const std::vector dims) +inline Shape::Shape(const std::vector& dims) : rank_(dims.size()), dims_(dims) {} +inline DimensionOrConstant::DimensionOrConstant(const Dimension* dim) + : dim(dim) { + DCHECK(dim != nullptr) << "Internal error: Got nullptr for Dimension."; +} + +inline DimensionOrConstant::DimensionOrConstant(int64 val) + : dim(nullptr), val(val) { + DCHECK(val >= 0 || val == InferenceContext::kUnknownDim) + << "Dimension must be non-negative or equal to " + "InferenceContext::kUnknownDim but got " + << val; +} + template Status InferenceContext::GetAttr(StringPiece attr_name, T* value) const { return GetNodeAttr(node_def_, attr_name, value); diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc index a1557912c70..1ecba2839a7 100644 --- a/tensorflow/core/framework/shape_inference_test.cc +++ b/tensorflow/core/framework/shape_inference_test.cc @@ -36,6 +36,19 @@ static OpDef MakeOpDef(int num_inputs, int num_outputs) { return op_reg_data.op_def; } +TEST(ShapeInferenceTest, DimensionOrConstant) { + NodeDef def; + InferenceContext c(&def, MakeOpDef(1, 1),
{"?"}, {}); + EXPECT_EQ(InferenceContext::kUnknownDim, + c.Value(InferenceContext::kUnknownDim)); + EXPECT_EQ(1, c.Value(1)); + +#ifndef NDEBUG + // Only run death test if DCHECKS are enabled. + EXPECT_DEATH(c.Value(-7), "Dimension must be non\\-negative or equal to"); +#endif +} + TEST(ShapeInferenceTest, RankAndDimInspection) { NodeDef def; InferenceContext c(&def, MakeOpDef(3, 2), {"?", "[1,?,3]", "[]"}, {}); @@ -767,15 +780,20 @@ TEST(ShapeInferenceTest, Divide) { EXPECT_EQ("Dimension size must be divisible by 5 but is 6", c.Divide(d_6, 5, &out).error_message()); + EXPECT_EQ("Divisor must be positive but is 0", + c.Divide(d_6, 0, &out).error_message()); + EXPECT_EQ("Divisor must be positive but is -1", + c.Divide(d_6, -1, &out).error_message()); } TEST(ShapeInferenceTest, Add) { NodeDef def; - InferenceContext c(&def, MakeOpDef(1, 2), {"[6,?]"}, {}); + InferenceContext c(&def, MakeOpDef(1, 2), {"[6,?,0]"}, {}); auto s = c.input(0); auto d_6 = c.Dim(s, 0); auto d_unknown = c.Dim(s, 1); + auto d_0 = c.Dim(s, 2); // Adding non-zero to unknown gives new unknown. const Dimension* out; @@ -790,16 +808,14 @@ TEST(ShapeInferenceTest, Add) { EXPECT_TRUE(out == d_6); // Adding dimension with value 0 to anything gives input. - EXPECT_TRUE(c.Add(d_unknown, c.MakeDim(0), &out).ok()); + EXPECT_TRUE(c.Add(d_unknown, c.MakeDim(0ll), &out).ok()); EXPECT_TRUE(out == d_unknown); - EXPECT_TRUE(c.Add(d_6, c.MakeDim(0), &out).ok()); + EXPECT_TRUE(c.Add(d_6, c.MakeDim(0ll), &out).ok()); EXPECT_TRUE(out == d_6); // Test addition. 
EXPECT_TRUE(c.Add(d_6, 2, &out).ok()); EXPECT_EQ("8", c.DebugString(out)); - EXPECT_TRUE(c.Add(d_6, -6, &out).ok()); - EXPECT_EQ("0", c.DebugString(out)); EXPECT_TRUE(c.Add(d_6, std::numeric_limits::max() - 6, &out).ok()); EXPECT_EQ(std::numeric_limits::max(), c.Value(out)); @@ -811,14 +827,62 @@ TEST(ShapeInferenceTest, Add) { EXPECT_EQ(std::numeric_limits::max(), c.Value(out)); EXPECT_TRUE(c.Add(d_6, c.UnknownDim(), &out).ok()); EXPECT_EQ("?", c.DebugString(out)); + EXPECT_TRUE(c.Add(d_0, d_6, &out).ok()); + EXPECT_TRUE(out == d_6); - EXPECT_EQ("Negative dimension size from adding 6 and -7", - c.Add(d_6, -7, &out).error_message()); EXPECT_EQ( "Dimension size overflow from adding 6 and 9223372036854775802", c.Add(d_6, std::numeric_limits::max() - 5, &out).error_message()); } +TEST(ShapeInferenceTest, Subtract) { + NodeDef def; + InferenceContext c(&def, MakeOpDef(1, 2), {"[6,?,0,5]"}, {}); + + auto s = c.input(0); + auto d_6 = c.Dim(s, 0); + auto d_unknown = c.Dim(s, 1); + auto d_0 = c.Dim(s, 2); + auto d_5 = c.Dim(s, 3); + + // Subtracting non-zero from unknown gives new unknown. + const Dimension* out; + EXPECT_TRUE(c.Subtract(d_unknown, 1, &out).ok()); + EXPECT_EQ("?", c.DebugString(out)); + EXPECT_TRUE(out != d_unknown); + + // Subtracting 0 from anything gives input. + EXPECT_TRUE(c.Subtract(d_unknown, 0ll, &out).ok()); + EXPECT_TRUE(out == d_unknown); + EXPECT_TRUE(c.Subtract(d_6, 0ll, &out).ok()); + EXPECT_TRUE(out == d_6); + + // Subtracting dimension with value 0 from anything gives input. + EXPECT_TRUE(c.Subtract(d_unknown, c.MakeDim(0ll), &out).ok()); + EXPECT_TRUE(out == d_unknown); + EXPECT_TRUE(c.Subtract(d_6, c.MakeDim(0ll), &out).ok()); + EXPECT_TRUE(out == d_6); + + // Test subtraction. + EXPECT_TRUE(c.Subtract(d_6, 2, &out).ok()); + EXPECT_EQ("4", c.DebugString(out)); + EXPECT_TRUE(c.Subtract(d_6, 6, &out).ok()); + EXPECT_EQ("0", c.DebugString(out)); + + // Test subtraction using dimension as second value. 
+ EXPECT_TRUE(c.Subtract(d_6, c.MakeDim(2), &out).ok()); + EXPECT_EQ("4", c.DebugString(out)); + EXPECT_TRUE(c.Subtract(d_6, d_5, &out).ok()); + EXPECT_EQ("1", c.DebugString(out)); + EXPECT_TRUE(c.Subtract(d_6, c.UnknownDim(), &out).ok()); + EXPECT_EQ("?", c.DebugString(out)); + EXPECT_TRUE(c.Subtract(d_6, d_0, &out).ok()); + EXPECT_TRUE(out == d_6); + + EXPECT_EQ("Negative dimension size caused by subtracting 6 from 5", + c.Subtract(d_5, d_6, &out).error_message()); +} + TEST(ShapeInferenceTest, Multiply) { NodeDef def; InferenceContext c(&def, MakeOpDef(1, 2), {"[6,?,0,1]"}, {}); @@ -831,7 +895,7 @@ TEST(ShapeInferenceTest, Multiply) { // Multiplying non-zero to unknown gives new unknown. const Dimension* out; - EXPECT_TRUE(c.Multiply(d_unknown, 1, &out).ok()); + EXPECT_TRUE(c.Multiply(d_unknown, 2, &out).ok()); EXPECT_EQ("?", c.DebugString(out)); // Multiplying 0 to anything gives 0. @@ -844,19 +908,19 @@ TEST(ShapeInferenceTest, Multiply) { // Multiplying 1 to anything gives the original. // (unknown -> unknown) - EXPECT_TRUE(c.Multiply(d_unknown, static_cast(1), &out).ok()); - EXPECT_EQ("?", c.DebugString(out)); + EXPECT_TRUE(c.Multiply(d_unknown, 1, &out).ok()); + EXPECT_EQ(d_unknown, out); EXPECT_TRUE(c.Multiply(d_unknown, d_1, &out).ok()); - EXPECT_EQ("?", c.DebugString(out)); + EXPECT_EQ(d_unknown, out); EXPECT_TRUE(c.Multiply(d_1, d_unknown, &out).ok()); - EXPECT_EQ("?", c.DebugString(out)); + EXPECT_EQ(d_unknown, out); // (known -> known) - EXPECT_TRUE(c.Multiply(d_6, static_cast(1), &out).ok()); - EXPECT_EQ("6", c.DebugString(out)); + EXPECT_TRUE(c.Multiply(d_6, 1, &out).ok()); + EXPECT_EQ(d_6, out); EXPECT_TRUE(c.Multiply(d_6, d_1, &out).ok()); - EXPECT_EQ("6", c.DebugString(out)); + EXPECT_EQ(d_6, out); EXPECT_TRUE(c.Multiply(d_1, d_6, &out).ok()); - EXPECT_EQ("6", c.DebugString(out)); + EXPECT_EQ(d_6, out); // Test multiplication. 
EXPECT_TRUE(c.Multiply(d_6, 2, &out).ok()); @@ -869,9 +933,112 @@ TEST(ShapeInferenceTest, Multiply) { EXPECT_EQ("12", c.DebugString(out)); EXPECT_TRUE(c.Multiply(d_6, c.UnknownDim(), &out).ok()); EXPECT_EQ("?", c.DebugString(out)); +} - EXPECT_EQ("Negative dimension size from multiplying 6 and -7", - c.Multiply(d_6, -7, &out).error_message()); +TEST(ShapeInferenceTest, FullyDefined) { + NodeDef def; + InferenceContext c(&def, MakeOpDef(0, 2), {}, {}); + + // No rank or missing dimension information should return false. + EXPECT_FALSE(c.FullyDefined(c.UnknownShape())); + EXPECT_FALSE(c.FullyDefined(c.Matrix(c.MakeDim(1), c.UnknownDim()))); + + // Return true if all information exists. + EXPECT_TRUE(c.FullyDefined(c.Matrix(c.MakeDim(1), c.MakeDim(2)))); + EXPECT_TRUE(c.FullyDefined(c.Scalar())); +} + +TEST(ShapeInferenceTest, ValidateKnownDim) { + NodeDef def; + InferenceContext c(&def, MakeOpDef(0, 2), {}, {}); + + EXPECT_FALSE(c.ValidateKnownDim(c.UnknownDim(), "unknown").ok()); + EXPECT_TRUE(c.ValidateKnownDim(c.Dim(c.Matrix(1, 2), 0), "known").ok()); +} + +TEST(ShapeInferenceTest, Min) { + NodeDef def; + InferenceContext c(&def, MakeOpDef(1, 2), {"[1,2,?,0]"}, {}); + + auto s = c.input(0); + auto d_1 = c.Dim(s, 0); + auto d_2 = c.Dim(s, 1); + auto d_unknown = c.Dim(s, 2); + auto d_0 = c.Dim(s, 3); + + // Minimum involving zero and unknown returns zero. + const Dimension* out; + EXPECT_TRUE(c.Min(d_0, d_unknown, &out).ok()); + EXPECT_EQ(d_0, out); + EXPECT_TRUE(c.Min(d_unknown, d_0, &out).ok()); + EXPECT_EQ(d_0, out); + EXPECT_TRUE(c.Min(c.MakeDim(0ll), d_unknown, &out).ok()); + EXPECT_EQ("0", c.DebugString(out)); + EXPECT_TRUE(c.Min(d_unknown, 0ll, &out).ok()); + EXPECT_EQ("0", c.DebugString(out)); + + // Minimum involving unknowns and non-zeros gives new unknown. 
+ EXPECT_TRUE(c.Min(d_unknown, d_unknown, &out).ok()); + EXPECT_EQ("?", c.DebugString(out)); + EXPECT_TRUE(c.Min(d_unknown, 1, &out).ok()); + EXPECT_EQ("?", c.DebugString(out)); + EXPECT_TRUE(c.Min(d_1, d_unknown, &out).ok()); + EXPECT_EQ("?", c.DebugString(out)); + + // Minimum with constant second arg. + EXPECT_TRUE(c.Min(d_1, 1, &out).ok()); + EXPECT_EQ(d_1, out); + EXPECT_TRUE(c.Min(d_1, 3, &out).ok()); + EXPECT_EQ(d_1, out); + EXPECT_TRUE(c.Min(d_2, 1, &out).ok()); + EXPECT_EQ("1", c.DebugString(out)); + + // Minimum with two dimensions. + EXPECT_TRUE(c.Min(d_1, d_1, &out).ok()); + EXPECT_EQ(d_1, out); + EXPECT_TRUE(c.Min(d_1, d_2, &out).ok()); + EXPECT_EQ(d_1, out); + EXPECT_TRUE(c.Min(d_2, d_1, &out).ok()); + EXPECT_EQ(d_1, out); + EXPECT_TRUE(c.Min(d_2, d_2, &out).ok()); + EXPECT_EQ(d_2, out); +} + +TEST(ShapeInferenceTest, Max) { + NodeDef def; + InferenceContext c(&def, MakeOpDef(1, 2), {"[1,2,?]"}, {}); + + auto s = c.input(0); + auto d_1 = c.Dim(s, 0); + auto d_2 = c.Dim(s, 1); + auto d_unknown = c.Dim(s, 2); + + // Maximum involving unknowns gives new unknown. + const Dimension* out; + EXPECT_TRUE(c.Max(d_unknown, d_unknown, &out).ok()); + EXPECT_EQ("?", c.DebugString(out)); + EXPECT_TRUE(c.Max(d_unknown, 1, &out).ok()); + EXPECT_EQ("?", c.DebugString(out)); + EXPECT_TRUE(c.Max(d_1, d_unknown, &out).ok()); + EXPECT_EQ("?", c.DebugString(out)); + + // Maximum with constant second arg. + EXPECT_TRUE(c.Max(d_1, 1, &out).ok()); + EXPECT_EQ(d_1, out); + EXPECT_TRUE(c.Max(d_2, 1, &out).ok()); + EXPECT_EQ(d_2, out); + EXPECT_TRUE(c.Max(d_2, 3, &out).ok()); + EXPECT_EQ("3", c.DebugString(out)); + + // Maximum with two dimensions. 
+ EXPECT_TRUE(c.Max(d_1, d_1, &out).ok()); + EXPECT_EQ(d_1, out); + EXPECT_TRUE(c.Max(d_1, d_2, &out).ok()); + EXPECT_EQ(d_2, out); + EXPECT_TRUE(c.Max(d_2, d_1, &out).ok()); + EXPECT_EQ(d_2, out); + EXPECT_TRUE(c.Max(d_2, d_2, &out).ok()); + EXPECT_EQ(d_2, out); } } // namespace shape_inference diff --git a/tensorflow/core/framework/shape_inference_testutil.cc b/tensorflow/core/framework/shape_inference_testutil.cc index c1e55d032d6..60a9cb101fd 100644 --- a/tensorflow/core/framework/shape_inference_testutil.cc +++ b/tensorflow/core/framework/shape_inference_testutil.cc @@ -40,6 +40,11 @@ Status InferShapes(ShapeInferenceTestOp op, const string& ins, shape_inference::InferenceContext c(&op.node_def, op_reg_data->op_def, ins_v, op.input_tensors); TF_RETURN_IF_ERROR(c.construction_status()); + if (op_reg_data->shape_inference_fn == nullptr) { + return errors::InvalidArgument( + "No shape inference function exists for op '", op.name, + "', did you forget to define it?"); + } TF_RETURN_IF_ERROR(op_reg_data->shape_inference_fn(&c)); const int num_outputs = c.num_outputs(); diff --git a/tensorflow/core/framework/tensor_shape.cc b/tensorflow/core/framework/tensor_shape.cc index 9f61d3d47e6..4e1a99acd68 100644 --- a/tensorflow/core/framework/tensor_shape.cc +++ b/tensorflow/core/framework/tensor_shape.cc @@ -33,13 +33,14 @@ static void AppendTo(const TensorShape& s, gtl::InlinedVector* vals) { } void TensorShape::CheckDimsEqual(int NDIMS) const { - CHECK_EQ(NDIMS, dims()) << "Asking for tensor of " << NDIMS - << " for a tensor of " << dims() << " dimensions"; + CHECK_EQ(NDIMS, dims()) << "Asking for tensor of " << NDIMS << " dimensions" + << " from a tensor of " << dims() << " dimensions"; } void TensorShape::CheckDimsAtLeast(int NDIMS) const { CHECK_GE(NDIMS, dims()) << "Asking for tensor of at least " << NDIMS - << " for a tensor of " << dims() << " dimensions"; + << " dimensions from a tensor of " << dims() + << " dimensions"; } bool TensorShape::IsValid(const
TensorShapeProto& proto) { diff --git a/tensorflow/core/framework/tensor_shape.h b/tensorflow/core/framework/tensor_shape.h index 16fe299b27c..f02927456f8 100644 --- a/tensorflow/core/framework/tensor_shape.h +++ b/tensorflow/core/framework/tensor_shape.h @@ -259,6 +259,10 @@ class TensorShapeUtils { static bool IsMatrix(const TensorShape& shape) { return shape.dims() == 2; } + static bool IsSquareMatrix(const TensorShape& shape) { + return shape.dims() == 2 && shape.dim_size(0) == shape.dim_size(1); + } + static bool IsMatrixOrHigher(const TensorShape& shape) { return shape.dims() >= 2; } diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc index 7cf25ba48f4..7098bed572f 100644 --- a/tensorflow/core/graph/graph_partition.cc +++ b/tensorflow/core/graph/graph_partition.cc @@ -935,13 +935,15 @@ Status Partition(const PartitionOptions& opts, Graph* g, ref_recvs.clear(); ref_control_inputs.clear(); const Edge* control_flow_edge = nullptr; + int32 num_control_flow_edges = 0; for (const Edge* edge : dst->in_edges()) { if (edge->IsControlEdge()) { if (IsMerge(edge->src()) && IsControlLoop(edge->src())) { // This is one of the control edges added for control flow. There // can be multiple such edges as the dest node may have multiple - // remote inputs. We will just take one and ignore the others. + // remote inputs. We keep track of the number of such edges. control_flow_edge = edge; + ++num_control_flow_edges; } else { inputs.push_back(edge); } @@ -953,7 +955,6 @@ Status Partition(const PartitionOptions& opts, Graph* g, // Process in order so that all data edges are added as inputs to // dst in Edge::dst_input() order. - bool recv_added = false; for (const Edge* edge : inputs) { const Node* src = edge->src(); if (!src->IsOp()) continue; // Skip Sink/Source nodes. 
@@ -1041,21 +1042,21 @@ Status Partition(const PartitionOptions& opts, Graph* g, AddRecv(opts, g_info, dst_graph, edge, &real_recv, &status); if (!status.ok()) return status; - // Fix up the control flow edge. Redirect it to the recv. + // Fix up the control flow edge. // NOTE(yuanbyu): 'real_recv' must be the real recv node. - recv_added = true; - if (control_flow_edge != nullptr) { + if (src_graph == dst_graph) { + // For same device send/recv, add a control edge from send to recv. + // This prevents the asynchronous recv kernel from being scheduled + // before the data is available. + AddInput(real_recv, send->name(), Graph::kControlSlot); + } else if (control_flow_edge != nullptr) { + // Redirect control edge to the real recv since this is not a same + // device send/recv. + --num_control_flow_edges; AddInput(real_recv, control_flow_edge->src()->name(), Graph::kControlSlot); } - // For same device send/recv, add a control edge from send to recv. - // This prevents the asynchronous recv kernel from being scheduled - // immediately. - if (src_graph == dst_graph) { - AddInput(real_recv, send->name(), Graph::kControlSlot); - } - if (!edge->IsControlEdge() && IsRefType(src->output_type(edge->src_output()))) { AddNodeAttr("_start_time", recv_start_time, recv); @@ -1092,9 +1093,12 @@ Status Partition(const PartitionOptions& opts, Graph* g, // execution of recvs until all the other inputs become available. AddReadControl(ref_recvs, ref_control_inputs); - // Add back this control edge for control flow if not used. - if (!recv_added && (control_flow_edge != nullptr)) { - AddInput(dst_def, control_flow_edge->src()->name(), Graph::kControlSlot); + // Add back the control edges for control flow that are not used. 
+ if (control_flow_edge != nullptr) { + for (int i = 0; i < num_control_flow_edges; ++i) { + AddInput(dst_def, control_flow_edge->src()->name(), + Graph::kControlSlot); + } } } diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index d52f3895180..3c2dab98b39 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -379,8 +379,8 @@ tf_kernel_libraries( "batch_matrix_diag_op", "batch_matrix_set_diag_op", "edit_distance_op", - "gather_nd_op", "gather_op", + "gather_nd_op", "identity_op", "immutable_constant_op", "listdiff_op", @@ -423,6 +423,7 @@ tf_kernel_libraries( "//tensorflow/core:lib_internal", "//tensorflow/core:proto_text", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/debug:debug_io_utils", "//third_party/eigen3", ], ) @@ -1018,13 +1019,14 @@ tf_kernel_libraries( "cholesky_grad", "determinant_op", "self_adjoint_eig_op", + "self_adjoint_eig_v2_op", "matrix_inverse_op", "matrix_solve_ls_op", "matrix_solve_op", "matrix_triangular_solve_op", + "svd_op", ], deps = [ - ":binary_linalg_ops_common", ":linalg_ops_common", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -1033,7 +1035,6 @@ tf_kernel_libraries( ], ) -# TODO(josh11b): Should these two *linalg_ops_common libraries be merged? 
cc_library( name = "linalg_ops_common", srcs = ["linalg_ops_common.cc"], @@ -1041,17 +1042,7 @@ cc_library( visibility = ["//visibility:private"], deps = [ "//tensorflow/core:framework", - "//third_party/eigen3", - ], -) - -cc_library( - name = "binary_linalg_ops_common", - srcs = ["binary_linalg_ops_common.cc"], - hdrs = ["binary_linalg_ops_common.h"], - visibility = ["//visibility:private"], - deps = [ - "//tensorflow/core:framework", + "//tensorflow/core:lib", "//third_party/eigen3", ], ) diff --git a/tensorflow/core/kernels/bcast_ops.cc b/tensorflow/core/kernels/bcast_ops.cc index 706ecaaee0a..b7851f9ff67 100644 --- a/tensorflow/core/kernels/bcast_ops.cc +++ b/tensorflow/core/kernels/bcast_ops.cc @@ -58,6 +58,8 @@ class BCastGradArgsOp : public OpKernel { Output(ctx, 1, bcast.grad_y_reduce_idx()); } + bool IsExpensive() override { return false; } + private: void Output(OpKernelContext* ctx, int idx, const BCast::Vec& v) { const int64 len = v.size(); diff --git a/tensorflow/core/kernels/binary_linalg_ops_common.cc b/tensorflow/core/kernels/binary_linalg_ops_common.cc deleted file mode 100644 index 17dcd843524..00000000000 --- a/tensorflow/core/kernels/binary_linalg_ops_common.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/kernels/binary_linalg_ops_common.h" - -namespace tensorflow { - -void BinaryLinearAlgebraOpBase::Compute(OpKernelContext* context) { - const Tensor& in_lhs = context->input(0); - const Tensor& in_rhs = context->input(1); - - const int input_rank = in_lhs.dims(); - if (SupportsBatchOperation()) { - OP_REQUIRES(context, input_rank >= 2, - errors::InvalidArgument("Input tensor must have rank >= 2")); - } else { - OP_REQUIRES(context, input_rank == 2, - errors::InvalidArgument("Input tensor must have rank == 2")); - } - // TODO(rmlarsen): Add support for broadcasting. - OP_REQUIRES( - context, input_rank == in_rhs.dims(), - errors::InvalidArgument( - "Tensors must have the same rank: rank(lhs) (%d) != rank(rhs) (%d)", - input_rank, in_rhs.dims())); - for (int dim = 0; dim < (in_rhs.dims() - 2); ++dim) { - OP_REQUIRES(context, in_rhs.dim_size(dim) == in_lhs.dim_size(dim), - errors::InvalidArgument( - "Dimension mismatch: %d != %d for dimension %d", - in_lhs.dim_size(dim), in_rhs.dim_size(dim), dim)); - } - - // If the tensor rank is greater than 2, we consider the inner-most - // dimensions as matrices, and loop over all the other outer - // dimensions to compute the results. 
- const int row_dimension = input_rank - 2; - const int col_dimension = input_rank - 1; - const int64 lhs_num_rows = in_lhs.dim_size(row_dimension); - const int64 lhs_num_cols = in_lhs.dim_size(col_dimension); - const int64 rhs_num_rows = in_rhs.dim_size(row_dimension); - const int64 rhs_num_cols = in_rhs.dim_size(col_dimension); - const TensorShape in_lhs_matrix_shape = - TensorShape({lhs_num_rows, lhs_num_cols}); - const TensorShape in_rhs_matrix_shape = - TensorShape({rhs_num_rows, rhs_num_cols}); - const TensorShape output_matrix_shape = - GetOutputMatrixShape(in_lhs_matrix_shape, in_rhs_matrix_shape); - OP_REQUIRES(context, output_matrix_shape.dims() <= 2, - errors::InvalidArgument("Output rank must be 1 or 2.")); - - int num_matrices = 1; - // The output has the shape of all the outer dimensions of the input - // except for the last two, plus the output_matrix_shape (if the output - // is not scalar). This still assumes that each input matrix is - // 2-dimensional, in accordance with the TODO above. - TensorShape output_shape; - if (input_rank == 2) { - output_shape = output_matrix_shape; - } else { - // Add the common outer dimensions. - for (int dim = 0; dim < input_rank - 2; ++dim) { - num_matrices *= in_lhs.dim_size(dim); - output_shape.AddDim(in_lhs.dim_size(dim)); - } - // Add the inner dimensions that depend on the operation implemented by the - // derived class. 
- for (int dim = 0; dim < output_matrix_shape.dims(); ++dim) { - output_shape.AddDim(output_matrix_shape.dim_size(dim)); - } - } - - Tensor* out = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &out)); - - auto shard = [this, &in_lhs, &in_lhs_matrix_shape, &in_rhs, - &in_rhs_matrix_shape, &output_matrix_shape, context, - out](int64 begin, int64 end) { - for (int64 i = begin; i < end; ++i) { - ComputeMatrix(context, i, in_lhs, in_lhs_matrix_shape, in_rhs, - in_rhs_matrix_shape, out, output_matrix_shape); - } - }; - - auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - Shard(worker_threads.num_threads, worker_threads.workers, num_matrices, - GetCostPerUnit(in_lhs_matrix_shape, in_rhs_matrix_shape), shard); -} - -template -void BinaryLinearAlgebraOp::ComputeMatrix( - OpKernelContext* context, int64 matrix_index, const Tensor& in_lhs, - const TensorShape& in_lhs_matrix_shape, const Tensor& in_rhs, - const TensorShape& in_rhs_matrix_shape, Tensor* out, - const TensorShape& output_matrix_shape) { - // TODO(kalakris): Handle alignment if possible. Eigen::Map is - // unaligned by default. - ConstMatrixMap in_lhs_map( - in_lhs.flat().data() + - matrix_index * in_lhs_matrix_shape.num_elements(), - in_lhs_matrix_shape.dim_size(0), in_lhs_matrix_shape.dim_size(1)); - ConstMatrixMap in_rhs_map( - in_rhs.flat().data() + - matrix_index * in_rhs_matrix_shape.num_elements(), - in_rhs_matrix_shape.dim_size(0), in_rhs_matrix_shape.dim_size(1)); - - // The output matrix shape may not be a matrix. - int num_output_rows = - output_matrix_shape.dims() >= 1 ? output_matrix_shape.dim_size(0) : 1; - int num_output_cols = - output_matrix_shape.dims() == 2 ? 
output_matrix_shape.dim_size(1) : 1; - MatrixMap output(out->flat().data() + - matrix_index * output_matrix_shape.num_elements(), - num_output_rows, num_output_cols); - ComputeMatrix(context, in_lhs_map, in_rhs_map, &output); -} - -// Explicitly instantiate BinaryLinearAlgebraOp for the scalar types we expect -// to use. -template class BinaryLinearAlgebraOp; -template class BinaryLinearAlgebraOp; -template class BinaryLinearAlgebraOp; -template class BinaryLinearAlgebraOp; - -} // namespace tensorflow diff --git a/tensorflow/core/kernels/binary_linalg_ops_common.h b/tensorflow/core/kernels/binary_linalg_ops_common.h deleted file mode 100644 index 9bbab797a63..00000000000 --- a/tensorflow/core/kernels/binary_linalg_ops_common.h +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_KERNELS_KERNELS_BINARY_LINALG_OPS_COMMON_H_ -#define TENSORFLOW_KERNELS_KERNELS_BINARY_LINALG_OPS_COMMON_H_ - -// Classes to support binary linear algebra operations. This should eventually -// be merged into third_party/tensorflow/core/kernels/linalg_ops_common.h. 
- -#define EIGEN_USE_THREADS - -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "tensorflow/core/framework/kernel_def_builder.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/work_sharder.h" - -namespace tensorflow { - -// Base class for binary linear algebra operators. -class BinaryLinearAlgebraOpBase : public OpKernel { - public: - explicit BinaryLinearAlgebraOpBase(OpKernelConstruction* context) - : OpKernel(context) {} - ~BinaryLinearAlgebraOpBase() override {} - - // Return the output shape of each individual matrix operation. Must be - // rank 0, 1, or 2. Scalar outputs are rank 0. - virtual TensorShape GetOutputMatrixShape( - const TensorShape& in_lhs_matrix_shape, - const TensorShape& in_rhs_matrix_shape) = 0; - - // Return the cost per matrix operation. Cost per unit is assumed to be - // roughly 1ns, based on comments in core/util/work_sharder.cc. - virtual int64 GetCostPerUnit(const TensorShape& in_lhs_matrix_shape, - const TensorShape& in_rhs_matrix_shape) = 0; - - // If SupportsBatchOperation() returns false, this Op will only accept rank 2 - // (if the supported input type is a matrix). If it returns true, the Op will - // accept inputs of rank >= 3, and repeatedly execute the operation on all - // matrices in the innermost two dimensions. - virtual bool SupportsBatchOperation() = 0; - - // Perform the actual computation on an input matrix, and store the results - // in the output. This will be called repeatedly for a single call to - // Compute(), if multiple matrices exist in the input Tensor. - // - // This function should only compute the results for a single input matrix. 
- // The 'matrix_index' parameter specifies the index of the matrix to be used - // from the input, and the index of the matrix to be written to in the output. - // The two input matrices are in row major order, and located at the memory - // addresses - // a_in.flat().data() + - // matrix_index * a_in_matrix_shape.num_elements(), and - // b_in.flat().data() + - // matrix_index * b_in_matrix_shape.num_elements(). - // The output matrix is in row major order, and is located at the memory - // address - // out->flat().data() + - // matrix_index * output_matrix_shape.num_elements(). - // The BinaryLinearAlgebraOp class below has functionality which - // performs - // this mapping and presents an interface based on the Eigen::MatrixBase API. - virtual void ComputeMatrix(OpKernelContext* context, int64 matrix_index, - const Tensor& a_in, - const TensorShape& a_in_matrix_shape, - const Tensor& b_in, - const TensorShape& b_in_matrix_shape, - Tensor* output, - const TensorShape& output_matrix_shape) = 0; - void Compute(OpKernelContext* context) override; -}; - -// This base class encapsulates the functionality of mapping the input and -// output tensors using Eigen::Map, so that the Eigen::MatrixBase API may be -// directly used by derived classes. -// SupportsBatchOperationT is a bool template argument which if set to true -// will allow the Op to process batches of matrices (rank >= 3); if set to -// false the Op will only accept rank 2 inputs. -template -class BinaryLinearAlgebraOp : public BinaryLinearAlgebraOpBase { - public: - explicit BinaryLinearAlgebraOp(OpKernelConstruction* context) - : BinaryLinearAlgebraOpBase(context) {} - - using Matrix = - Eigen::Matrix; - using ConstMatrixMap = - Eigen::Map>; - using MatrixMap = Eigen::Map< - Eigen::Matrix>; - - // Perform the actual computation on the input matrix, and store the results - // in the output. This will be called repeatedly for a single call to - // Compute(), if multiple matrices exist in the input Tensor. 
- virtual void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& a, - const ConstMatrixMap& b, MatrixMap* output) = 0; - - bool SupportsBatchOperation() final { return SupportsBatchOperationT; } - - // A concrete implementation of BinaryLinearAlgebraOpBase::ComputeMatrix(). - void ComputeMatrix(OpKernelContext* context, int64 matrix_index, - const Tensor& a_in, const TensorShape& a_in_matrix_shape, - const Tensor& b_in, const TensorShape& b_in_matrix_shape, - Tensor* output, - const TensorShape& output_matrix_shape) final; -}; - -// Declare that BinaryLinearAlgebraOp is explicitly instantiated in -// linalg_ops_common.cc for float and double. -extern template class BinaryLinearAlgebraOp; -extern template class BinaryLinearAlgebraOp; -extern template class BinaryLinearAlgebraOp; -extern template class BinaryLinearAlgebraOp; - -} // namespace tensorflow - -#define REGISTER_BINARY_LINALG_OP(OpName, OpClass, Scalar) \ - REGISTER_KERNEL_BUILDER( \ - Name(OpName).Device(DEVICE_CPU).TypeConstraint("T"), OpClass) - -#endif // TENSORFLOW_KERNELS_KERNELS_BINARY_LINALG_OPS_COMMON_H_ diff --git a/tensorflow/core/kernels/cholesky_grad.cc b/tensorflow/core/kernels/cholesky_grad.cc index 3c1a227215b..027897a4195 100644 --- a/tensorflow/core/kernels/cholesky_grad.cc +++ b/tensorflow/core/kernels/cholesky_grad.cc @@ -18,63 +18,56 @@ limitations under the License. 
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/kernels/binary_linalg_ops_common.h" +#include "tensorflow/core/kernels/linalg_ops_common.h" namespace tensorflow { -template -class CholeskyGrad - : public BinaryLinearAlgebraOp { +template +class CholeskyGrad : public LinearAlgebraOp { public: - explicit CholeskyGrad(OpKernelConstruction* context) - : BinaryLinearAlgebraOp(context) {} - ~CholeskyGrad() override {} + typedef LinearAlgebraOp Base; - using Matrix = - Eigen::Matrix; - using ConstMatrixMap = Eigen::Map; - using MatrixMap = Eigen::Map; + explicit CholeskyGrad(OpKernelConstruction* context) : Base(context) {} + + using TensorShapes = typename Base::TensorShapes; + using Matrix = typename Base::Matrix; + using MatrixMap = typename Base::MatrixMap; + using MatrixMaps = typename Base::MatrixMaps; + using ConstMatrixMap = typename Base::ConstMatrixMap; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; using ConstRef = Eigen::Ref; using Ref = Eigen::Ref; - TensorShape GetOutputMatrixShape( - const TensorShape& input_matrix_l_full_shape, - const TensorShape& input_matrix_grad_shape) override { - return input_matrix_l_full_shape; - } - - int64 GetCostPerUnit(const TensorShape& input_matrix_shape, - const TensorShape& rhs_matrix_shape) override { - const int64 rows = input_matrix_shape.dim_size(0); - if (rows > (1LL << 20)) { - // A big number to cap the cost in case overflow. 
- return kint64max; - } else { - return rows * rows * rows; - } - } - - void ComputeMatrix(OpKernelContext* context, - const ConstMatrixMap& input_matrix_l_full, - const ConstMatrixMap& input_matrix_grad, - MatrixMap* output_matrix) override { + void ValidateInputMatrixShapes( + OpKernelContext* context, + const TensorShapes& input_matrix_shapes) const final { + OP_REQUIRES(context, input_matrix_shapes.size() == 2, + errors::InvalidArgument("Expected two input matrices, got %d.", + input_matrix_shapes.size())); + OP_REQUIRES(context, input_matrix_shapes[0] == input_matrix_shapes[1], + errors::InvalidArgument( + "Inputs (L and grad) must have the same shape.")); OP_REQUIRES(context, - input_matrix_l_full.rows() == input_matrix_l_full.cols(), - errors::InvalidArgument("Input matrix must be square.")); - OP_REQUIRES( - context, input_matrix_l_full.cols() == input_matrix_grad.cols(), - errors::InvalidArgument( - "Input matrix and gradient must have same number of cols.")); - OP_REQUIRES( - context, input_matrix_l_full.rows() == input_matrix_grad.rows(), - errors::InvalidArgument( - "Input matrix and gradient must have same number of rows.")); + TensorShapeUtils::IsSquareMatrix(input_matrix_shapes[0]), + errors::InvalidArgument("Inputs must be a square matrices.")); + } + + TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const final { + return TensorShapes({input_matrix_shapes[0]}); + } + + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { + const ConstMatrixMap& input_matrix_l_full = inputs[0]; + const ConstMatrixMap& input_matrix_grad = inputs[1]; + MatrixMap output_matrix = outputs->at(0); // Algorithm only depends on lower triangular half on input_matrix_l. const Matrix input_matrix_l = input_matrix_l_full.template triangularView(); // Algorithm only depends on lower triangular half on input_matrix_grad. 
- *output_matrix = input_matrix_grad.template triangularView(); + output_matrix = input_matrix_grad.template triangularView(); const int64 kMatrixSize = input_matrix_l.rows(); const int64 kMaxBlockSize = 32; @@ -97,34 +90,35 @@ class CholeskyGrad auto B = input_matrix_l.block(block_end, 0, trailing_size, block_begin); auto B_bar = - output_matrix->block(block_end, 0, trailing_size, block_begin); + output_matrix.block(block_end, 0, trailing_size, block_begin); auto C = input_matrix_l.block(block_end, block_begin, trailing_size, block_size); - auto C_bar = output_matrix->block(block_end, block_begin, trailing_size, - block_size); + auto C_bar = output_matrix.block(block_end, block_begin, trailing_size, + block_size); auto D = input_matrix_l.block(block_begin, block_begin, block_size, block_size); - auto D_bar = output_matrix->block(block_begin, block_begin, block_size, - block_size); + auto D_bar = + output_matrix.block(block_begin, block_begin, block_size, block_size); auto R = input_matrix_l.block(block_begin, 0, block_size, block_begin); - auto R_bar = - output_matrix->block(block_begin, 0, block_size, block_begin); + auto R_bar = output_matrix.block(block_begin, 0, block_size, block_begin); - C_bar = D.adjoint().template triangularView() - .solve(C_bar.adjoint()).adjoint(); + C_bar = D.adjoint() + .template triangularView() + .solve(C_bar.adjoint()) + .adjoint(); D_bar -= (C_bar.adjoint() * C).template triangularView(); B_bar -= C_bar * R; R_bar -= C_bar.adjoint() * B; CholeskyGradUnblocked(D, D_bar); R_bar -= (D_bar + D_bar.adjoint()) * R; } - *output_matrix = - (0.5 * (*output_matrix + output_matrix->transpose())).eval(); + output_matrix = (0.5 * (output_matrix + output_matrix.transpose())).eval(); } + private: void CholeskyGradUnblocked(const ConstRef& l_block, Ref grad_block) { const int64 kMatrixSize = l_block.rows(); for (int64 k = kMatrixSize - 1; k >= 0; k--) { @@ -162,11 +156,8 @@ class CholeskyGrad } }; -REGISTER_BINARY_LINALG_OP("CholeskyGrad", 
(CholeskyGrad), float); -REGISTER_BINARY_LINALG_OP("CholeskyGrad", (CholeskyGrad), - double); -REGISTER_BINARY_LINALG_OP("BatchCholeskyGrad", (CholeskyGrad), - float); -REGISTER_BINARY_LINALG_OP("BatchCholeskyGrad", (CholeskyGrad), - double); +REGISTER_LINALG_OP("CholeskyGrad", (CholeskyGrad), float); +REGISTER_LINALG_OP("CholeskyGrad", (CholeskyGrad), double); +REGISTER_LINALG_OP("BatchCholeskyGrad", (CholeskyGrad), float); +REGISTER_LINALG_OP("BatchCholeskyGrad", (CholeskyGrad), double); } // namespace tensorflow diff --git a/tensorflow/core/kernels/cholesky_op.cc b/tensorflow/core/kernels/cholesky_op.cc index e3a68f3e738..f124ce2cefc 100644 --- a/tensorflow/core/kernels/cholesky_op.cc +++ b/tensorflow/core/kernels/cholesky_op.cc @@ -17,9 +17,8 @@ limitations under the License. // TODO(konstantinos): Enable complex inputs. This will require additional tests // and OP_REQUIRES. -#include - #include "third_party/eigen3/Eigen/Cholesky" +#include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -30,37 +29,21 @@ limitations under the License. namespace tensorflow { -template -class CholeskyOp - : public UnaryLinearAlgebraOp { +template +class CholeskyOp : public LinearAlgebraOp { public: - explicit CholeskyOp(OpKernelConstruction* context) - : UnaryLinearAlgebraOp(context) {} + typedef LinearAlgebraOp Base; - TensorShape GetOutputMatrixShape( - const TensorShape& input_matrix_shape) override { - return input_matrix_shape; - } + explicit CholeskyOp(OpKernelConstruction* context) : Base(context) {} - int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override { - const int64 rows = input_matrix_shape.dim_size(0); - if (rows > (1LL << 20)) { - // A big number to cap the cost in case overflow. 
- return kint64max; - } else { - return rows * rows * rows; - } - } + using Matrix = typename Base::Matrix; + using MatrixMaps = typename Base::MatrixMaps; + using ConstMatrixMap = typename Base::ConstMatrixMap; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; - using - typename UnaryLinearAlgebraOp::MatrixMap; - using typename UnaryLinearAlgebraOp::ConstMatrixMap; - - void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input, - MatrixMap* output) override { - OP_REQUIRES(context, input.rows() == input.cols(), - errors::InvalidArgument("Input matrix must be square.")); + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { + const ConstMatrixMap& input = inputs[0]; if (input.rows() == 0) { // If X is an empty matrix (0 rows, 0 col), X * X' == X. // Therefore, we return X. @@ -74,7 +57,7 @@ class CholeskyOp llt_decomposition(input); // Output the lower triangular in a dense form. - *output = llt_decomposition.matrixL(); + outputs->at(0) = llt_decomposition.matrixL(); OP_REQUIRES(context, llt_decomposition.info() == Eigen::Success, errors::InvalidArgument("LLT decomposition was not successful. " diff --git a/tensorflow/core/kernels/cwise_op_lgamma.cc b/tensorflow/core/kernels/cwise_op_lgamma.cc index 930a861eae2..b7fe4472dc2 100644 --- a/tensorflow/core/kernels/cwise_op_lgamma.cc +++ b/tensorflow/core/kernels/cwise_op_lgamma.cc @@ -16,8 +16,17 @@ limitations under the License. 
#include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER3(UnaryOp, CPU, "Lgamma", functor::lgamma, float, Eigen::half, double); + +template +class LgammaOp : public UnaryOp { + public: + explicit LgammaOp(OpKernelConstruction* ctx) : UnaryOp(ctx) { + TF_ANNOTATE_BENIGN_RACE(&signgam, "signgam output from lgamma is unused"); + } +}; + +REGISTER3(LgammaOp, CPU, "Lgamma", functor::lgamma, float, Eigen::half, double); #if GOOGLE_CUDA -REGISTER3(UnaryOp, GPU, "Lgamma", functor::lgamma, float, Eigen::half, double); +REGISTER3(LgammaOp, GPU, "Lgamma", functor::lgamma, float, Eigen::half, double); #endif } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc index a0997c2d71b..823e7e14ed9 100644 --- a/tensorflow/core/kernels/cwise_ops_test.cc +++ b/tensorflow/core/kernels/cwise_ops_test.cc @@ -23,13 +23,14 @@ limitations under the License. namespace tensorflow { -// Creates a Graph which applies a unary "func" on a 3D float tensor -// of "num" elements. +// Creates a Graph which applies a unary "func" on a 3D tensor of +// type T with "num" elements. 
+template static Graph* Unary(const string& func, int num, DataType dtype) { Graph* g = new Graph(OpRegistry::Global()); Tensor data(dtype, TensorShape({64, 64, num / (64 * 64)})); CHECK_GT(data.NumElements(), 0); - data.flat().setRandom(); + data.flat().setRandom(); test::graph::Unary(g, func, test::graph::Constant(g, data), 0); return g; } @@ -40,21 +41,23 @@ static int RowsAndColsArg(int r, int c) { return r * kRows + c; } static int RowsFromArg(int arg) { return (arg / kRows); } static int ColsFromArg(int arg) { return (arg % kRows); } -#define BM_UNARY(DEVICE, FUNC, TYPE) \ - static void BM_##DEVICE##_##FUNC_##TYPE(int iters, int num) { \ - const int64 tot = static_cast(iters) * num; \ - testing::ItemsProcessed(tot); \ - testing::BytesProcessed(tot * sizeof(float)); \ - test::Benchmark(#DEVICE, Unary(#FUNC, num, TYPE)).Run(iters); \ - } \ - BENCHMARK(BM_##DEVICE##_##FUNC_##TYPE)->Range(4 << 10, 1 << 20); +#define BM_UNARY(DEVICE, FUNC, T, TYPE) \ + static void BM_##DEVICE##_##FUNC##_##TYPE(int iters, int num) { \ + const int64 tot = static_cast(iters) * num; \ + testing::ItemsProcessed(tot); \ + testing::BytesProcessed(tot * sizeof(T)); \ + test::Benchmark(#DEVICE, Unary(#FUNC, num, TYPE)).Run(iters); \ + } \ + BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20); -BM_UNARY(cpu, Floor, DT_FLOAT); -BM_UNARY(gpu, Floor, DT_FLOAT); -BM_UNARY(cpu, Conj, DT_COMPLEX64); -BM_UNARY(gpu, Conj, DT_COMPLEX64); -BM_UNARY(cpu, Conj, DT_COMPLEX128); -BM_UNARY(gpu, Conj, DT_COMPLEX128); +BM_UNARY(cpu, Floor, float, DT_FLOAT); +BM_UNARY(gpu, Floor, float, DT_FLOAT); +BM_UNARY(cpu, Floor, double, DT_DOUBLE); +BM_UNARY(gpu, Floor, double, DT_DOUBLE); +BM_UNARY(cpu, Conj, std::complex, DT_COMPLEX64); +BM_UNARY(gpu, Conj, std::complex, DT_COMPLEX64); +BM_UNARY(cpu, Conj, std::complex, DT_COMPLEX128); +BM_UNARY(gpu, Conj, std::complex, DT_COMPLEX128); // data func scalar. 
static Graph* BinaryScalar(int num, const string& func) { diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h index 3e46970812f..17c87ffab28 100644 --- a/tensorflow/core/kernels/debug_ops.h +++ b/tensorflow/core/kernels/debug_ops.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_KERNELS_DEBUG_OP_H_ #include "tensorflow/core/common_runtime/gpu/gpu_util.h" +#include "tensorflow/core/debug/debug_io_utils.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_util.h" @@ -46,6 +47,7 @@ class CopyOp : public OpKernel { Tensor* copied_tensor; OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(), &copied_tensor)); +#if GOOGLE_CUDA if (off_host_input) { // Input is not on host: deep-copy it from GPU to the same GPU. Notification done_copy; @@ -57,6 +59,9 @@ class CopyOp : public OpKernel { // The input tensor is on the host (CPU): deep-copy from CPU to CPU. *copied_tensor = tensor::DeepCopy(src_tensor); } +#else + *copied_tensor = tensor::DeepCopy(src_tensor); +#endif // GOOGLE_CUDA } bool IsExpensive() override { return false; } @@ -73,10 +78,16 @@ class DebugIdentityOp : public OpKernel { public: explicit DebugIdentityOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_)); - // TODO(cais): Add debug_url + OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_)); } void Compute(OpKernelContext* context) override { + if (!debug_urls_.empty()) { + DebugIO::PublishDebugTensor(tensor_name_, "DebugIdentity", + context->input(0), + Env::Default()->NowMicros(), debug_urls_); + } + context->set_output(0, context->input(0)); } @@ -84,6 +95,7 @@ class DebugIdentityOp : public OpKernel { private: string tensor_name_; + std::vector debug_urls_; }; // NaN-counter op for debugging. 
@@ -92,6 +104,7 @@ class DebugNanCountOp : public OpKernel { public: explicit DebugNanCountOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_)); + OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_)); } void Compute(OpKernelContext* context) override { @@ -120,6 +133,7 @@ class DebugNanCountOp : public OpKernel { private: string tensor_name_; + std::vector debug_urls_; }; // TODO(cais): Add DebugInfinityCount diff --git a/tensorflow/core/kernels/debug_ops_test.cc b/tensorflow/core/kernels/debug_ops_test.cc index e584d43e22d..e526754d316 100644 --- a/tensorflow/core/kernels/debug_ops_test.cc +++ b/tensorflow/core/kernels/debug_ops_test.cc @@ -13,6 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include +#include +#include + #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/node_def_builder.h" @@ -22,20 +27,32 @@ limitations under the License. 
#include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/util/event.pb.h" namespace tensorflow { namespace { class DebugIdentityOpTest : public OpsTestBase { protected: - Status Init(DataType input_type) { + Status Init(DataType input_type, const std::vector debug_urls) { + env_ = Env::Default(); + TF_CHECK_OK(NodeDefBuilder("op", "DebugIdentity") .Input(FakeInput(input_type)) .Attr("tensor_name", "FakeTensor:0") + .Attr("debug_urls", debug_urls) .Finalize(node_def())); return InitOp(); } + + Status Init(DataType input_type) { + std::vector empty_debug_urls; + return Init(input_type, empty_debug_urls); + } + + Env* env_; }; TEST_F(DebugIdentityOpTest, Int32Success_6) { @@ -48,6 +65,80 @@ TEST_F(DebugIdentityOpTest, Int32Success_6) { test::ExpectTensorEqual(expected, *GetOutput(0)); } +TEST_F(DebugIdentityOpTest, Int32Success_6_FileURLs) { + const int kNumDumpDirs = 3; + + const string tmp_dir = testing::TmpDir(); + + std::vector dump_roots; + std::vector debug_urls; + for (int i = 0; i < kNumDumpDirs; ++i) { + const string dump_root = strings::StrCat(tmp_dir, "_", i); + dump_roots.push_back(dump_root); + + debug_urls.push_back(strings::StrCat("file://", dump_root)); + } + + uint64 wall_time = Env::Default()->NowMicros(); + + TF_ASSERT_OK(Init(DT_INT32, debug_urls)); + AddInputFromArray(TensorShape({6}), {1, 2, 3, 4, 5, 6}); + TF_ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_INT32, TensorShape({6})); + test::FillValues(&expected, {1, 2, 3, 4, 5, 6}); + // Verify the identity output + test::ExpectTensorEqual(expected, *GetOutput(0)); + + for (int i = 0; i < kNumDumpDirs; ++i) { + ASSERT_TRUE(env_->FileExists(dump_roots[i])); + ASSERT_TRUE(env_->IsDirectory(dump_roots[i]).ok()); + + DIR* dir = opendir(dump_roots[i].c_str()); + struct dirent* ent; + int 
dump_files_found = 0; + while ((ent = readdir(dir)) != NULL) { + if (strcmp(ent->d_name, ".") && strcmp(ent->d_name, "..")) { + dump_files_found++; + + // Try reading the file into a Event proto. + const string dump_file_path = + strings::StrCat(dump_roots[i], "/", ent->d_name); + std::fstream ifs(dump_file_path, std::ios::in | std::ios::binary); + Event event; + event.ParseFromIstream(&ifs); + ifs.close(); + + ASSERT_GE(event.wall_time(), wall_time); + ASSERT_EQ(1, event.summary().value().size()); + ASSERT_EQ(strings::StrCat("FakeTensor", ":", 0, ":", "DebugIdentity"), + event.summary().value(0).node_name()); + + Tensor tensor_prime(DT_INT32); + ASSERT_TRUE(tensor_prime.FromProto(event.summary().value(0).tensor())); + + // Verify tensor shape and value from the dump file. + ASSERT_EQ(TensorShape({6}), tensor_prime.shape()); + + for (int j = 0; j < 6; ++j) { + ASSERT_EQ(j + 1, tensor_prime.flat()(j)); + } + } + } + closedir(dir); + + ASSERT_EQ(1, dump_files_found); + + // Remove temporary dump directory and file. + int64 undeleted_files = 0; + int64 undeleted_dirs = 0; + ASSERT_TRUE(env_->DeleteRecursively(dump_roots[i], &undeleted_files, + &undeleted_dirs) + .ok()); + ASSERT_EQ(0, undeleted_files); + ASSERT_EQ(0, undeleted_dirs); + } +} + TEST_F(DebugIdentityOpTest, Int32Success_2_3) { TF_ASSERT_OK(Init(DT_INT32)); AddInputFromArray(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6}); @@ -66,8 +157,6 @@ TEST_F(DebugIdentityOpTest, StringSuccess) { test::ExpectTensorEqual(expected, *GetOutput(0)); } -TEST_F(DebugIdentityOpTest, RefInputError) { TF_ASSERT_OK(Init(DT_INT32_REF)); } - // Tests for DebugNanCountOp class DebugNanCountOpTest : public OpsTestBase { protected: diff --git a/tensorflow/core/kernels/determinant_op.cc b/tensorflow/core/kernels/determinant_op.cc index 00d44f5cacc..b2e69a8df5e 100644 --- a/tensorflow/core/kernels/determinant_op.cc +++ b/tensorflow/core/kernels/determinant_op.cc @@ -27,49 +27,36 @@ limitations under the License. 
namespace tensorflow { -template -class DeterminantOp - : public UnaryLinearAlgebraOp { +template +class DeterminantOp : public LinearAlgebraOp { public: - explicit DeterminantOp(OpKernelConstruction* context) - : UnaryLinearAlgebraOp(context) {} - ~DeterminantOp() override {} + typedef LinearAlgebraOp Base; - TensorShape GetOutputMatrixShape( - const TensorShape& input_matrix_shape) override { - return TensorShape({}); + explicit DeterminantOp(OpKernelConstruction* context) : Base(context) {} + + using TensorShapes = typename Base::TensorShapes; + using MatrixMaps = typename Base::MatrixMaps; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; + + TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shape) const final { + return TensorShapes({TensorShape({})}); } - int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override { - const int64 rows = input_matrix_shape.dim_size(0); - if (rows > (1LL << 20)) { - // A big number to cap the cost in case overflow. - return kint64max; - } else { - return rows * rows * rows; - } - } - - using - typename UnaryLinearAlgebraOp::MatrixMap; - using typename UnaryLinearAlgebraOp::ConstMatrixMap; - - void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input, - MatrixMap* output) override { - OP_REQUIRES(context, input.rows() == input.cols(), - errors::InvalidArgument("Input matrix must be square.")); + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { Scalar determinant; - if (input.rows() == 0) { - // An empty matrix' determinant is defined to be 1. See - // wikipedia. + if (inputs[0].rows() == 0) { + // An empty matrix' determinant is defined to be 1. See wikipedia. 
determinant = 1; } else { - determinant = input.determinant(); + determinant = inputs[0].determinant(); } - OP_REQUIRES(context, std::isfinite(determinant), - errors::Internal("The determinant is not finite.")); - (*output)(0, 0) = determinant; + // TODO(rmlarsen): Don't fail on infinite determinants, since that could + // be a valid result and the user should check for it instead. + OP_REQUIRES(context, Eigen::numext::isfinite(determinant), + errors::InvalidArgument("The determinant is not finite.")); + outputs->at(0)(0, 0) = determinant; } }; diff --git a/tensorflow/core/kernels/example_parsing_ops_test.cc b/tensorflow/core/kernels/example_parsing_ops_test.cc index 3f0db980856..e58cecff147 100644 --- a/tensorflow/core/kernels/example_parsing_ops_test.cc +++ b/tensorflow/core/kernels/example_parsing_ops_test.cc @@ -170,6 +170,7 @@ typedef BenchmarkOptions, true> DenseFloat; #define BM_ParseExample(TYPE, B, K) \ static void BM_ParseExample##_##TYPE##_##B##_##K(int iters) { \ int64 items_per_iter = static_cast(B) * K; \ + testing::UseRealTime(); \ testing::ItemsProcessed(static_cast(iters) * items_per_iter); \ test::Benchmark("cpu", ParseExample(B, K)).Run(iters); \ } \ diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc index b4d9f03efc6..c2a5192efb1 100644 --- a/tensorflow/core/kernels/gather_nd_op.cc +++ b/tensorflow/core/kernels/gather_nd_op.cc @@ -16,13 +16,11 @@ limitations under the License. // See docs in ../ops/array_ops.cc. 
#define EIGEN_USE_THREADS -#include - +#include "tensorflow/core/kernels/gather_nd_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/kernels/bounds_check.h" -#include "tensorflow/core/kernels/gather_nd_op.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/types.h" @@ -155,97 +153,6 @@ class GatherNdOp : public OpKernel { } }; -// Specialization of GatherNdSlice to CPU -namespace generator { - -template -class GatherNdSliceGenerator { - public: - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE GatherNdSliceGenerator( - const Index slice_size, typename TTypes::ConstMatrix Tindices, - typename TTypes::ConstTensor Tparams, - typename TTypes::Matrix Tout, std::atomic* error_loc) - : slice_size_(slice_size), - Tindices_(Tindices), - Tparams_(Tparams), - Tout_(Tout), - error_loc_(error_loc) {} - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool GenerateIndices( - const Index loc, Eigen::array* ix) const { - (*ix)[IXDIM] = 0; - bool out_of_bounds = false; - for (int i = 0; i < IXDIM; ++i) { - const Index ix_i = internal::SubtleMustCopy(Tindices_(loc, i)); - (*ix)[i] = ix_i; - out_of_bounds |= !FastBoundsCheck(ix_i, Tparams_.dimension(i)); - } - return out_of_bounds; - } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int32 - operator()(const Eigen::array& loc_array) const { - const Index loc = loc_array[0]; - Eigen::array ix; - Eigen::array ix_out; - ix_out[0] = loc; - ix_out[1] = 0; - const bool out_of_bounds = GenerateIndices(loc, &ix); - if (TF_PREDICT_FALSE(out_of_bounds)) { - error_loc_->store(loc); - std::fill_n(&Tout_(ix_out), slice_size_, T()); - } else { - std::copy_n(&Tparams_(ix), slice_size_, &Tout_(ix_out)); - } - - return static_cast(0); // Return something... 
- } - - private: - const Index slice_size_; - const typename TTypes::ConstMatrix Tindices_; - const typename TTypes::ConstTensor Tparams_; - mutable typename TTypes::Matrix Tout_; - std::atomic* error_loc_; -}; - -} // namespace generator - -namespace functor { - -template -struct GatherNdSlice { - Index operator()(const CPUDevice& d, const Index slice_size, - typename TTypes::Scalar Tscratch, - typename TTypes::ConstTensor Tparams, - typename TTypes::ConstMatrix Tindices, - typename TTypes::Matrix Tout) { - std::atomic error_loc(-1); - - const Eigen::DenseIndex batch_size = Tindices.dimension(0); -#if !defined(EIGEN_HAS_INDEX_LIST) - Eigen::Tensor::Dimensions reshape_dims{{ 1 }}; - Eigen::array broadcast_dims{{ batch_size }}; -#else - Eigen::IndexList > reshape_dims; - Eigen::IndexList broadcast_dims; - broadcast_dims.set(0, batch_size); -#endif - generator::GatherNdSliceGenerator gather_nd_generator( - slice_size, Tindices, Tparams, Tout, &error_loc); - Tscratch.device(d) = Tscratch.reshape(reshape_dims) - .broadcast(broadcast_dims) - .generate(gather_nd_generator) - .sum(); - - // error_loc() returns -1 if there's no out-of-bounds index, - // otherwise it returns the location of an OOB index in Tindices. - return error_loc.load(); - } -}; - -} // namespace functor - #define REGISTER_GATHER_ND_FULL(dev, type, index_type) \ REGISTER_KERNEL_BUILDER(Name("GatherNd") \ .Device(DEVICE_##dev) \ diff --git a/tensorflow/core/kernels/gather_nd_op.h b/tensorflow/core/kernels/gather_nd_op.h index 0ee783bd593..d7279d5712a 100644 --- a/tensorflow/core/kernels/gather_nd_op.h +++ b/tensorflow/core/kernels/gather_nd_op.h @@ -20,6 +20,7 @@ limitations under the License. 
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/platform/types.h" namespace tensorflow { diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h new file mode 100644 index 00000000000..dc028c2f1e9 --- /dev/null +++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h @@ -0,0 +1,145 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_KERNELS_GATHER_ND_OP_CPU_IMPL_H_ +#define TENSORFLOW_KERNELS_GATHER_ND_OP_CPU_IMPL_H_ + +// Specialization of GatherNdSlice to CPU + +#define EIGEN_USE_THREADS + +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/kernels/gather_nd_op.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/util.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +namespace generator { + +template +class GatherNdSliceGenerator { + public: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE GatherNdSliceGenerator( + const Index slice_size, typename TTypes::ConstMatrix Tindices, + typename TTypes::ConstTensor Tparams, + typename TTypes::Matrix Tout, std::atomic* error_loc) + : slice_size_(slice_size), + Tindices_(Tindices), + Tparams_(Tparams), + Tout_(Tout), + error_loc_(error_loc) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool GenerateIndices( + const Index loc, Eigen::array* ix) const { + (*ix)[IXDIM] = 0; + bool out_of_bounds = false; + for (int i = 0; i < IXDIM; ++i) { + const Index ix_i = internal::SubtleMustCopy(Tindices_(loc, i)); + (*ix)[i] = ix_i; + out_of_bounds |= !FastBoundsCheck(ix_i, Tparams_.dimension(i)); + } + return out_of_bounds; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int32 + operator()(const Eigen::array& loc_array) const { + const Index loc = loc_array[0]; + Eigen::array ix; + Eigen::array ix_out; + ix_out[0] = loc; + ix_out[1] = 0; + const bool out_of_bounds = GenerateIndices(loc, &ix); + if (TF_PREDICT_FALSE(out_of_bounds)) { + error_loc_->store(loc); + std::fill_n(&Tout_(ix_out), slice_size_, T()); + } else { + std::copy_n(&Tparams_(ix), 
slice_size_, &Tout_(ix_out)); + } + + return static_cast(0); // Return something... + } + + private: + const Index slice_size_; + const typename TTypes::ConstMatrix Tindices_; + const typename TTypes::ConstTensor Tparams_; + mutable typename TTypes::Matrix Tout_; + std::atomic* error_loc_; +}; + +} // namespace generator + +namespace functor { + +template +struct GatherNdSlice { + Index operator()(const CPUDevice& d, const Index slice_size, + typename TTypes::Scalar Tscratch, + typename TTypes::ConstTensor Tparams, + typename TTypes::ConstMatrix Tindices, + typename TTypes::Matrix Tout) { + std::atomic error_loc(-1); + + const Eigen::DenseIndex batch_size = Tindices.dimension(0); +#if !defined(EIGEN_HAS_INDEX_LIST) + Eigen::Tensor::Dimensions reshape_dims{{ 1 }}; + Eigen::array broadcast_dims{{ batch_size }}; +#else + Eigen::IndexList > reshape_dims; + Eigen::IndexList broadcast_dims; + broadcast_dims.set(0, batch_size); +#endif + generator::GatherNdSliceGenerator gather_nd_generator( + slice_size, Tindices, Tparams, Tout, &error_loc); + Tscratch.device(d) = Tscratch.reshape(reshape_dims) + .broadcast(broadcast_dims) + .generate(gather_nd_generator) + .sum(); + + // error_loc() returns -1 if there's no out-of-bounds index, + // otherwise it returns the location of an OOB index in Tindices. 
+ return error_loc.load(); + } +}; + +#define REGISTER_GATHER_ND_FULL(T, Index) \ + template Index GatherNdSlice:: \ + operator()(const CPUDevice& d, const Index slice_size, \ + typename TTypes::Scalar Tscratch, \ + typename TTypes::ConstTensor Tparams, \ + typename TTypes::ConstMatrix Tindices, \ + typename TTypes::Matrix Tout); + +#define REGISTER_GATHER_ND_CPU(type) \ + REGISTER_GATHER_ND_FULL(type, int32); \ + REGISTER_GATHER_ND_FULL(type, int64) + +TF_CALL_ALL_TYPES(REGISTER_GATHER_ND_CPU); + +} // namespace functor + +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_GATHER_ND_OP_CPU_IMPL_H_ diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_0.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_0.cc new file mode 100644 index 00000000000..246e9f729b8 --- /dev/null +++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_0.cc @@ -0,0 +1,18 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define CPU_PROVIDED_IXDIM 0 +#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h" +#undef CPU_PROVIDED_IXDIM diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_1.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_1.cc new file mode 100644 index 00000000000..5b7720fc4ef --- /dev/null +++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_1.cc @@ -0,0 +1,18 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define CPU_PROVIDED_IXDIM 1 +#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h" +#undef CPU_PROVIDED_IXDIM diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_2.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_2.cc new file mode 100644 index 00000000000..0f6932394ed --- /dev/null +++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_2.cc @@ -0,0 +1,18 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define CPU_PROVIDED_IXDIM 2 +#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h" +#undef CPU_PROVIDED_IXDIM diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_3.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_3.cc new file mode 100644 index 00000000000..1c2aec7820a --- /dev/null +++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_3.cc @@ -0,0 +1,18 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define CPU_PROVIDED_IXDIM 3 +#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h" +#undef CPU_PROVIDED_IXDIM diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_4.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_4.cc new file mode 100644 index 00000000000..3e164668c5b --- /dev/null +++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_4.cc @@ -0,0 +1,18 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define CPU_PROVIDED_IXDIM 4 +#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h" +#undef CPU_PROVIDED_IXDIM diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_5.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_5.cc new file mode 100644 index 00000000000..7141ea70df9 --- /dev/null +++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_5.cc @@ -0,0 +1,18 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define CPU_PROVIDED_IXDIM 5 +#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h" +#undef CPU_PROVIDED_IXDIM diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc index 41c0a2cdd72..575c7e2e7c2 100644 --- a/tensorflow/core/kernels/linalg_ops_common.cc +++ b/tensorflow/core/kernels/linalg_ops_common.cc @@ -15,98 +15,234 @@ limitations under the License. 
#include "tensorflow/core/kernels/linalg_ops_common.h" +#include "third_party/eigen3/Eigen/Core" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + namespace tensorflow { -void UnaryLinearAlgebraOpBase::Compute(OpKernelContext* context) { - const Tensor& in = context->input(0); +// static +template +void LinearAlgebraOp::ValidateSingleMatrix( + OpKernelContext* context, const TensorShapes& input_matrix_shapes) { + OP_REQUIRES(context, input_matrix_shapes.size() == 1, + errors::InvalidArgument("Expected a single input matrix, got %d.", + input_matrix_shapes.size())); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_matrix_shapes[0]), + errors::InvalidArgument("Input must be a matrix.")); +} - const int input_rank = in.dims(); - if (SupportsBatchOperation()) { - OP_REQUIRES(context, input_rank >= 2, - errors::InvalidArgument("Input tensor must have rank >= 2")); - } else { - OP_REQUIRES(context, input_rank == 2, - errors::InvalidArgument("Input tensor must have rank == 2")); - } +// static +template +void LinearAlgebraOp:: + ValidateSingleSquareMatrix(OpKernelContext* context, + const TensorShapes& input_matrix_shapes) { + OP_REQUIRES(context, input_matrix_shapes.size() == 1, + errors::InvalidArgument("Expected a single input matrix, got %d.", + input_matrix_shapes.size())); + OP_REQUIRES(context, TensorShapeUtils::IsSquareMatrix(input_matrix_shapes[0]), + errors::InvalidArgument("Input matrix must be square.")); +} - // If the tensor rank is greater than input_rank, we consider the inner-most - // dimensions as matrices, and loop over all the other outer - // dimensions to compute the results. 
- const int row_dimension = input_rank - 2; - const int col_dimension = input_rank - 1; - const int64 num_rows = in.dim_size(row_dimension); - const int64 num_cols = in.dim_size(col_dimension); - const TensorShape input_matrix_shape = TensorShape({num_rows, num_cols}); - const TensorShape output_matrix_shape = - GetOutputMatrixShape(input_matrix_shape); - OP_REQUIRES(context, output_matrix_shape.dims() <= 2, - errors::InvalidArgument("Output rank must be 1 or 2.")); +// static +template +void LinearAlgebraOp::ValidateSolver( + OpKernelContext* context, const TensorShapes& input_matrix_shapes) { + OP_REQUIRES(context, input_matrix_shapes.size() == 2, + errors::InvalidArgument("Expected two input matrices, got %d.", + input_matrix_shapes.size())); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_matrix_shapes[0]), + errors::InvalidArgument("First input (lhs) must be a matrix.")); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_matrix_shapes[1]), + errors::InvalidArgument("Second input (rhs) must be a matrix.")); + OP_REQUIRES( + context, + input_matrix_shapes[0].dim_size(0) == input_matrix_shapes[1].dim_size(0), + errors::InvalidArgument("Input matrix and rhs are incompatible.")); +} - int num_matrices = 1; - // The output has the shape of all the outer dimensions of the input - // except for the last two, plus the output_matrix_shape (if the output - // is not scalar). This assumes that each input matrix is - // 2-dimensional. - TensorShape output_shape; - if (input_rank == 2) { - output_shape = output_matrix_shape; - } else { - // Add the common outer dimensions. - for (int dim = 0; dim < input_rank - 2; ++dim) { - num_matrices *= in.dim_size(dim); - output_shape.AddDim(in.dim_size(dim)); - } - // Add the inner dimensions that depend on the operation implemented by the - // derived class. 
- for (int dim = 0; dim < output_matrix_shape.dims(); ++dim) { - output_shape.AddDim(output_matrix_shape.dim_size(dim)); - } - } +// static +template +void LinearAlgebraOp::ValidateSquareSolver( + OpKernelContext* context, const TensorShapes& input_matrix_shapes) { + OP_REQUIRES(context, input_matrix_shapes.size() == 2, + errors::InvalidArgument("Expected two input matrices, got %d.", + input_matrix_shapes.size())); + OP_REQUIRES( + context, TensorShapeUtils::IsSquareMatrix(input_matrix_shapes[0]), + errors::InvalidArgument("First input (lhs) must be a square matrix.")); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_matrix_shapes[1]), + errors::InvalidArgument("Second input (rhs) must be a matrix.")); + OP_REQUIRES( + context, + input_matrix_shapes[0].dim_size(0) == input_matrix_shapes[1].dim_size(0), + errors::InvalidArgument("Input matrix and rhs are incompatible.")); +} - Tensor* out = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &out)); +template +void LinearAlgebraOp::Compute( + OpKernelContext* context) { + TensorInputs inputs; + TensorShapes input_matrix_shapes; + TensorShape batch_shape; + AnalyzeInputs(context, &inputs, &input_matrix_shapes, &batch_shape); - auto shard = [this, &in, &input_matrix_shape, &output_matrix_shape, context, - out](int64 begin, int64 end) { + TensorShapes output_matrix_shapes; + TensorOutputs outputs; + PrepareOutputs(context, input_matrix_shapes, batch_shape, &outputs, + &output_matrix_shapes); + + // Process the individual matrix problems in parallel using a threadpool. 
+ auto shard = [this, &inputs, &input_matrix_shapes, &outputs, + &output_matrix_shapes, context](int64 begin, int64 end) { for (int64 i = begin; i < end; ++i) { - ComputeMatrix(context, i, in, input_matrix_shape, out, - output_matrix_shape); + ComputeTensorSlice(context, i, inputs, input_matrix_shapes, outputs, + output_matrix_shapes); } }; - auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads()); - Shard(worker_threads.num_threads, worker_threads.workers, num_matrices, - GetCostPerUnit(input_matrix_shape), shard); + Shard(worker_threads.num_threads, worker_threads.workers, + batch_shape.num_elements(), GetCostPerUnit(input_matrix_shapes), shard); } -template -void UnaryLinearAlgebraOp::ComputeMatrix( - OpKernelContext* context, int64 matrix_index, const Tensor& in, - const TensorShape& input_matrix_shape, Tensor* out, - const TensorShape& output_matrix_shape) { - // TODO(kalakris): Handle alignment if possible. Eigen::Map is - // unaligned by default. - ConstMatrixMap input(in.flat().data() + - matrix_index * input_matrix_shape.num_elements(), - input_matrix_shape.dim_size(0), - input_matrix_shape.dim_size(1)); +template +void LinearAlgebraOp::AnalyzeInputs( + OpKernelContext* context, TensorInputs* inputs, + TensorShapes* input_matrix_shapes, TensorShape* batch_shape) { + int input_rank = -1; + for (int i = 0; i < NumMatrixInputs(context); ++i) { + const Tensor& in = context->input(i); + if (i == 0) { + input_rank = in.dims(); + if (SupportsBatchOperation) { + OP_REQUIRES( + context, input_rank >= 2, + errors::InvalidArgument("Input tensor ", i, + " must have rank >= 2, got", input_rank)); + } else { + OP_REQUIRES( + context, input_rank == 2, + errors::InvalidArgument("Input tensor ", i, + " must have rank == 2, got", input_rank)); + } - // The output matrix shape may not be a matrix. - int num_output_rows = - output_matrix_shape.dims() >= 1 ? output_matrix_shape.dim_size(0) : 1; - int num_output_cols = - output_matrix_shape.dims() == 2 ? 
output_matrix_shape.dim_size(1) : 1; - MatrixMap output(out->flat().data() + - matrix_index * output_matrix_shape.num_elements(), - num_output_rows, num_output_cols); - ComputeMatrix(context, input, &output); + // If the tensor rank is greater than 2, we consider the inner-most + // dimensions as matrices, and loop over all the other outer ("batch") + // dimensions to compute the results. + for (int dim = 0; dim < input_rank - 2; ++dim) { + batch_shape->AddDim(in.dim_size(dim)); + } + } else { + // Make sure that all inputs have the same rank and outer dimensions. + OP_REQUIRES(context, input_rank == in.dims(), + errors::InvalidArgument( + "All input tensors must have the same rank.")); + for (int dim = 0; dim < input_rank - 2; ++dim) { + OP_REQUIRES( + context, in.dim_size(dim) == batch_shape->dim_size(dim), + errors::InvalidArgument( + "All input tensors must have the same outer dimensions.")); + } + } + + const int row_dimension = input_rank - 2; + const int col_dimension = input_rank - 1; + const int64 num_rows = in.dim_size(row_dimension); + const int64 num_cols = in.dim_size(col_dimension); + // TODO(rmlarsen): Use emplace_back when it is added to InlinedVector. Same + // in several places below. + input_matrix_shapes->push_back(TensorShape({num_rows, num_cols})); + inputs->push_back(in); + } + // Have the derived class validate that the inputs are as expected. + ValidateInputMatrixShapes(context, *input_matrix_shapes); } -// Explicitly instantiate UnaryLinearAlgebraOp for the scalar types we expect to +template +void LinearAlgebraOp::PrepareOutputs( + OpKernelContext* context, const TensorShapes& input_matrix_shapes, + const TensorShape& batch_shape, TensorOutputs* outputs, + TensorShapes* output_matrix_shapes) { + // Get shape for each of the matrix outputs produced by the derived class. 
+ *output_matrix_shapes = GetOutputMatrixShapes(input_matrix_shapes); + const int num_outputs = output_matrix_shapes->size(); + + // Make sure the number of op outputs is what the derived class expects. + OP_REQUIRES( + context, num_outputs <= context->num_outputs(), + errors::Internal( + "Derived class expected more outputs (%d) that the op has (%d).", + num_outputs, context->num_outputs())); + + // Allocate outputs. + for (int i = 0; i < context->num_outputs(); ++i) { + TensorShape output_tensor_shape({0}); + if (i < num_outputs) { + // This output is used, set up output shape and allocate it. + const TensorShape& output_matrix_shape = output_matrix_shapes->at(i); + OP_REQUIRES(context, output_matrix_shape.dims() <= 2, + errors::InvalidArgument( + "Rank of matrix output no. %d must be 0, 1 or 2, got %d.", + i, output_matrix_shape.dims())); + + // The final output has the shape of the outer batch dimensions + // concatenated with the output_matrix_shape (if the output is not + // scalar). + output_tensor_shape = batch_shape; + for (int dim = 0; dim < output_matrix_shape.dims(); ++dim) { + output_tensor_shape.AddDim(output_matrix_shape.dim_size(dim)); + } + } + Tensor* out = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(i, output_tensor_shape, &out)); + outputs->push_back(out); + } +} + +template +void LinearAlgebraOp::ComputeTensorSlice( + OpKernelContext* context, int64 matrix_index, const TensorInputs& inputs, + const TensorShapes& input_matrix_shapes, const TensorOutputs& outputs, + const TensorShapes& output_matrix_shapes) { + ConstMatrixMaps matrix_inputs; + for (int i = 0; i < inputs.size(); ++i) { + // TODO(kalakris): Handle alignment if possible. Eigen::Map is + // unaligned by default. 
+ matrix_inputs.push_back( + ConstMatrixMap(inputs[i].flat().data() + + matrix_index * input_matrix_shapes[i].num_elements(), + input_matrix_shapes[i].dim_size(0), + input_matrix_shapes[i].dim_size(1))); + } + + MatrixMaps matrix_outputs; + for (int i = 0; i < output_matrix_shapes.size(); ++i) { + // The output matrix shape may not be a matrix. + int num_output_rows = output_matrix_shapes[i].dims() >= 1 + ? output_matrix_shapes[i].dim_size(0) + : 1; + int num_output_cols = output_matrix_shapes[i].dims() == 2 + ? output_matrix_shapes[i].dim_size(1) + : 1; + matrix_outputs.push_back( + MatrixMap(outputs[i]->flat().data() + + matrix_index * output_matrix_shapes[i].num_elements(), + num_output_rows, num_output_cols)); + } + ComputeMatrix(context, matrix_inputs, &matrix_outputs); +} + +// Explicitly instantiate LinearAlgebraOp for the scalar types we expect to // use. -template class UnaryLinearAlgebraOp; -template class UnaryLinearAlgebraOp; -template class UnaryLinearAlgebraOp; -template class UnaryLinearAlgebraOp; +template class LinearAlgebraOp; +template class LinearAlgebraOp; +template class LinearAlgebraOp; +template class LinearAlgebraOp; } // namespace tensorflow diff --git a/tensorflow/core/kernels/linalg_ops_common.h b/tensorflow/core/kernels/linalg_ops_common.h index 25e9ddafca4..3be9853c6cf 100644 --- a/tensorflow/core/kernels/linalg_ops_common.h +++ b/tensorflow/core/kernels/linalg_ops_common.h @@ -19,9 +19,11 @@ limitations under the License. // Classes to support linear algebra functionality, similar to the numpy.linalg // module. Supports batch computation on several matrices at once, sharding the // computations across different threads if necessary. +#include #define EIGEN_USE_THREADS +#include "third_party/eigen3/Eigen/Core" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -30,97 +32,142 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/work_sharder.h" namespace tensorflow { -// Base class for unary linear algebra operators. -class UnaryLinearAlgebraOpBase : public OpKernel { +// Base class for linear algebra operators. +template +class LinearAlgebraOp : public OpKernel { public: - explicit UnaryLinearAlgebraOpBase(OpKernelConstruction* context) - : OpKernel(context) {} - ~UnaryLinearAlgebraOpBase() override {} - - // Return the output shape of each individual matrix operation. Must be - // rank 0, 1, or 2. Scalar outputs are rank 0. - virtual TensorShape GetOutputMatrixShape( - const TensorShape& input_matrix_shape) = 0; - - // Return the cost per matrix operation. Cost per unit is assumed to be - // roughly 1ns, based on comments in core/util/work_sharder.cc. - virtual int64 GetCostPerUnit(const TensorShape& input_matrix_shape) = 0; - - // If SupportsBatchOperation() returns false, this Op will only accept rank 2 - // (if the supported input type is a matrix). If it returns true, the Op will - // accept inputs of rank >= 3, and repeatedly execute the operation on all - // matrices in the innermost two dimensions. - virtual bool SupportsBatchOperation() = 0; - - // Perform the actual computation on an input matrix, and store the results - // in the output. This will be called repeatedly for a single call to - // Compute(), if multiple matrices exist in the input Tensor. - // - // This function should only compute the results for a single input matrix. - // The 'matrix_index' parameter specifies the index of the matrix to be used - // from the input, and the index of the matrix to be written to in the output. 
- // The input matrix is in row major order, and is located at the memory - // address - // in.flat().data() + - // matrix_index * input_matrix_shape.num_elements(). - // The output matrix is in row major order, and is located at the memory - // address - // out->flat().data() + - // matrix_index * output_matrix_shape.num_elements(). - // The UnaryLinearAlgebraOp class below has functionality which - // performs - // this mapping and presents an interface based on the Eigen::MatrixBase API. - virtual void ComputeMatrix(OpKernelContext* context, int64 matrix_index, - const Tensor& in, - const TensorShape& input_matrix_shape, Tensor* out, - const TensorShape& output_matrix_shape) = 0; + explicit LinearAlgebraOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override; -}; -// This base class encapsulates the functionality of mapping the input and -// output tensors using Eigen::Map, so that the Eigen::MatrixBase API may be -// directly used by derived classes. -// SupportsBatchOperationT is a bool template argument which if set to true -// will allow the Op to process batches of matrices (rank >= 3); if set to -// false the Op will only accept rank 2 inputs. -template -class UnaryLinearAlgebraOp : public UnaryLinearAlgebraOpBase { - public: - explicit UnaryLinearAlgebraOp(OpKernelConstruction* context) - : UnaryLinearAlgebraOpBase(context) {} + protected: + using TensorShapes = gtl::InlinedVector; + // Returns the number of leading inputs that are to be treated as matrix + // inputs. By default this is all the inputs. Derived classes can override + // this to tell the base class to ignore one or more trailing inputs. + virtual int NumMatrixInputs(const OpKernelContext* context) const { + return context->num_inputs(); + } + + // Returns true if the number of inputs and their shapes are as expected. + // Many ops take a single square input matrix, so we provide that as a default + // implementation for convenience. 
+ virtual void ValidateInputMatrixShapes( + OpKernelContext* context, const TensorShapes& input_matrix_shapes) const { + ValidateSingleSquareMatrix(context, input_matrix_shapes); + } + + // Convenience validators for common cases: + // + // Validate op taking a single matrix A. + static void ValidateSingleMatrix(OpKernelContext* context, + const TensorShapes& input_matrix_shapes); + // Validate op taking a single square matrix A. + static void ValidateSingleSquareMatrix( + OpKernelContext* context, const TensorShapes& input_matrix_shapes); + // Validate op taking two matrices A and B that have the same number of rows. + static void ValidateSolver(OpKernelContext* context, + const TensorShapes& input_matrix_shapes); + // Validate op taking two matrices A and B that have the same number of rows + // and A is square. + static void ValidateSquareSolver(OpKernelContext* context, + const TensorShapes& input_matrix_shapes); + + // Returns the output shapes of each individual matrix operation. Output + // matrices shapes must be rank 0, 1, or 2. Scalar outputs are rank 0. + // + // The derived class may return a number of shapes (N) less than + // context->num_outputs() (M) to indicate that a only leading subset of + // the outputs will be populated. In this case, a dummy scalar tensor with + // value zero will be return for the last M-N outputs. + // + // For many ops, the output dimensions are the same as the input dimensions, + // so we provide that as a default implementation for convenience. + virtual TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const { + return input_matrix_shapes; + } + + // Returns the cost per matrix operation. This is used to determine the + // number of threads to use for parallelizing calls to ComputeMatrix in + // batch mode. Cost per unit is assumed to be roughly 1ns, based on comments + // in core/util/work_sharder.cc. 
Many linear algebra ops take roughly max(m,n) + // * min(m,n)^2, where the first input matrix is m-by-n. We provide that as a + // default implementation for convenience. + virtual int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const { + double m = static_cast(input_matrix_shapes[0].dim_size(0)); + double n = static_cast(input_matrix_shapes[0].dim_size(1)); + double cost = std::max(m, n) * std::min(m, n) * std::min(m, n); + return cost >= static_cast(kint64max) ? kint64max + : static_cast(cost); + } using Matrix = Eigen::Matrix; using ConstMatrixMap = Eigen::Map; using MatrixMap = Eigen::Map; + using ConstMatrixMaps = gtl::InlinedVector; + using MatrixMaps = gtl::InlinedVector; - // Perform the actual computation on the input matrix, and store the results - // in the output. This will be called repeatedly for a single call to - // Compute(), if multiple matrices exist in the input Tensor. + // Performs a single matrix computation given input matrices, and + // stores the result in outputs. For batch operations, this will be called + // repeatedly for a single call to Compute() when multiple matrices exist in + // input Tensors with rank > 2. In this case the calls to ComputeMatrix are + // parallelized. The number of threads used is determined by a cost model from + // the value returned by GetCostPerUnit(). virtual void ComputeMatrix(OpKernelContext* context, - const ConstMatrixMap& input, - MatrixMap* output) = 0; + const ConstMatrixMaps& inputs, + MatrixMaps* outputs) = 0; - bool SupportsBatchOperation() final { return SupportsBatchOperationT; } + private: + using TensorInputs = gtl::InlinedVector; + using TensorOutputs = gtl::InlinedVector; - // A concrete implementation of UnaryLinearAlgebraOpBase::ComputeMatrix(). 
- void ComputeMatrix(OpKernelContext* context, int64 matrix_index, - const Tensor& in, const TensorShape& input_matrix_shape, - Tensor* out, const TensorShape& output_matrix_shape) final; + // This function maps slices (matrices) of the input and output tensors using + // Eigen::Map and calls ComputeMatrix implemented in terms of the + // Eigen::MatrixBase API by the derived class. + // + // The 'matrix_index' parameter specifies the index of the matrix to be used + // from each input tensor, and the index of the matrix to be written to each + // output tensor. The input matrices are in row major order, and located at + // the memory addresses + // inputs[i].flat().data() + + // matrix_index * input_matrix_shapes[i].num_elements() + // for i in 0...inputs.size()-1. + // The output matrices are in row major order, and located at the memory + // address + // outputs[i]->flat().data() + + // matrix_index * output_matrix_shapes[i].num_elements(). + // for i in 0...outputs.size()-1. + // + void ComputeTensorSlice(OpKernelContext* context, int64 matrix_index, + const TensorInputs& inputs, + const TensorShapes& input_matrix_shapes, + const TensorOutputs& outputs, + const TensorShapes& output_matrix_shapes); + + void AnalyzeInputs(OpKernelContext* context, TensorInputs* inputs, + TensorShapes* input_matrix_shapes, + TensorShape* batch_shape); + + void PrepareOutputs(OpKernelContext* context, + const TensorShapes& input_matrix_shapes, + const TensorShape& batch_shape, TensorOutputs* outputs, + TensorShapes* output_matrix_shapes); }; -// Declare that UnaryLinearAlgebraOp is explicitly instantiated in +// Declare that LinearAlgebraOp is explicitly instantiated in // linalg_ops_common.cc for float and double. 
-extern template class UnaryLinearAlgebraOp; -extern template class UnaryLinearAlgebraOp; -extern template class UnaryLinearAlgebraOp; -extern template class UnaryLinearAlgebraOp; +extern template class LinearAlgebraOp; +extern template class LinearAlgebraOp; +extern template class LinearAlgebraOp; +extern template class LinearAlgebraOp; } // namespace tensorflow diff --git a/tensorflow/core/kernels/matrix_inverse_op.cc b/tensorflow/core/kernels/matrix_inverse_op.cc index 94fa4a3a23a..d548e3f65be 100644 --- a/tensorflow/core/kernels/matrix_inverse_op.cc +++ b/tensorflow/core/kernels/matrix_inverse_op.cc @@ -14,9 +14,8 @@ limitations under the License. ==============================================================================*/ // See docs in ../ops/linalg_ops.cc. -#include -#include "third_party/eigen3/Eigen/Cholesky" +#include "third_party/eigen3/Eigen/Core" #include "third_party/eigen3/Eigen/LU" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -29,40 +28,23 @@ limitations under the License. namespace tensorflow { -template -class MatrixInverseOp - : public UnaryLinearAlgebraOp { +template +class MatrixInverseOp : public LinearAlgebraOp { public: - explicit MatrixInverseOp(OpKernelConstruction* context) - : UnaryLinearAlgebraOp(context) { + typedef LinearAlgebraOp Base; + + explicit MatrixInverseOp(OpKernelConstruction* context) : Base(context) { OP_REQUIRES_OK(context, context->GetAttr("adjoint", &adjoint_)); } - ~MatrixInverseOp() override {} - TensorShape GetOutputMatrixShape( - const TensorShape& input_matrix_shape) override { - return input_matrix_shape; - } - - int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override { - const int64 rows = input_matrix_shape.dim_size(0); - if (rows > (1LL << 20)) { - // A big number to cap the cost in case overflow. 
- return kint64max; - } else { - return rows * rows * rows; - } - } - - typedef UnaryLinearAlgebraOp Base; using Matrix = typename Base::Matrix; - using MatrixMap = typename Base::MatrixMap; + using MatrixMaps = typename Base::MatrixMaps; using ConstMatrixMap = typename Base::ConstMatrixMap; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; - void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input, - MatrixMap* output) override { - OP_REQUIRES(context, input.rows() == input.cols(), - errors::InvalidArgument("Input matrix must be square.")); + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { + const ConstMatrixMap& input = inputs[0]; if (input.rows() == 0) { // By definition, an empty matrix's inverse is an empty matrix. return; @@ -86,7 +68,7 @@ class MatrixInverseOp lu_decomposition.matrixLU().diagonal().cwiseAbs().minCoeff(); OP_REQUIRES(context, min_abs_pivot > Scalar(0), errors::InvalidArgument("Input is not invertible.")); - output->noalias() = lu_decomposition.inverse(); + outputs->at(0).noalias() = lu_decomposition.inverse(); } private: diff --git a/tensorflow/core/kernels/matrix_solve_ls_op.cc b/tensorflow/core/kernels/matrix_solve_ls_op.cc index 2d3239cfe66..9ee3f2c924f 100644 --- a/tensorflow/core/kernels/matrix_solve_ls_op.cc +++ b/tensorflow/core/kernels/matrix_solve_ls_op.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ // See docs in ../ops/linalg_ops.cc. -#include #include "third_party/eigen3/Eigen/Cholesky" #include "third_party/eigen3/Eigen/Core" @@ -22,72 +21,68 @@ limitations under the License. 
#include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/kernels/binary_linalg_ops_common.h" +#include "tensorflow/core/kernels/linalg_ops_common.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { -template -class MatrixSolveLsOp - : public BinaryLinearAlgebraOp { +template +class MatrixSolveLsOp : public LinearAlgebraOp { public: - explicit MatrixSolveLsOp(OpKernelConstruction* context) - : BinaryLinearAlgebraOp(context) { + typedef LinearAlgebraOp Base; + + explicit MatrixSolveLsOp(OpKernelConstruction* context) : Base(context) { OP_REQUIRES_OK(context, context->GetAttr("fast", &fast_)); } - ~MatrixSolveLsOp() override {} + using TensorShapes = typename Base::TensorShapes; + using Matrix = typename Base::Matrix; + using MatrixMaps = typename Base::MatrixMaps; + using ConstMatrixMap = typename Base::ConstMatrixMap; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; - TensorShape GetOutputMatrixShape( - const TensorShape& input_matrix_shape, - const TensorShape& rhs_matrix_shape) override { - CHECK_EQ(input_matrix_shape.dims(), rhs_matrix_shape.dims()); - TensorShape output_matrix_shape = rhs_matrix_shape; - output_matrix_shape.set_dim( - output_matrix_shape.dims() - 2, - input_matrix_shape.dim_size(output_matrix_shape.dims() - 1)); - return output_matrix_shape; + // Tell the base class to ignore the regularization parameter + // in context->input(2). 
+ int NumMatrixInputs(const OpKernelContext* context) const final { return 2; } + + virtual void ValidateInputMatrixShapes( + OpKernelContext* context, + const TensorShapes& input_matrix_shapes) const final { + Base::ValidateSolver(context, input_matrix_shapes); } - int64 GetCostPerUnit(const TensorShape& input_matrix_shape, - const TensorShape& rhs_matrix_shape) override { - const int64 rows = input_matrix_shape.dim_size(0); - const int64 rhss = rhs_matrix_shape.dim_size(1); - if (rows > (1LL << 20)) { - // A big number to cap the cost in case overflow. - return kint32max; - } else { - return 2 * rows * rows * (rows + rhss); - } + TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const final { + return TensorShapes({TensorShape({input_matrix_shapes[0].dim_size(1), + input_matrix_shapes[1].dim_size(1)})}); } - typedef - typename BinaryLinearAlgebraOp::Matrix - Matrix; - typedef - typename BinaryLinearAlgebraOp::MatrixMap - MatrixMap; - typedef typename BinaryLinearAlgebraOp< - Scalar, SupportsBatchOperationT>::ConstMatrixMap ConstMatrixMap; + int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final { + double m = static_cast(input_matrix_shapes[0].dim_size(0)); + double n = static_cast(input_matrix_shapes[0].dim_size(1)); + double num_rhss = static_cast(input_matrix_shapes[1].dim_size(1)); + double cost = std::max(m, n) * std::min(m, n) * (std::min(m, n) + num_rhss); + return cost >= static_cast(kint64max) ? 
kint64max + : static_cast(cost); + } - void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& matrix, - const ConstMatrixMap& rhs, MatrixMap* output) override { - const int64 rows = matrix.rows(); - const int64 cols = matrix.cols(); - OP_REQUIRES( - context, rows == rhs.rows(), - errors::InvalidArgument("Input matrix and rhs are incompatible.")); + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { + const ConstMatrixMap& matrix = inputs[0]; + const ConstMatrixMap& rhs = inputs[1]; const auto& l2_regularizer_in = context->input(2); OP_REQUIRES( context, TensorShapeUtils::IsScalar(l2_regularizer_in.shape()), errors::InvalidArgument("l2_regularizer must be scalar, got shape ", l2_regularizer_in.shape().DebugString())); const double l2_regularizer = l2_regularizer_in.scalar()(); - OP_REQUIRES(context, l2_regularizer >= 0, errors::InvalidArgument("l2_regularizer must be >= 0.")); + + const int64 rows = matrix.rows(); + const int64 cols = matrix.cols(); if (rows == 0 || cols == 0) { // The result is the empty matrix. return; @@ -119,7 +114,7 @@ class MatrixSolveLsOp errors::InvalidArgument("Input matrix was rank deficient or " "ill-conditioned. Try setting fast=False " "or provide a larger l2_regularizer > 0.")); - *output = llt.solve(matrix.transpose() * rhs); + outputs->at(0) = llt.solve(matrix.transpose() * rhs); } else { // Underdetermined case (rows < cols): Solves the minimum-norm problem // min ||X||_F^2 s.t. A*X = RHS @@ -139,7 +134,7 @@ class MatrixSolveLsOp errors::InvalidArgument("Input matrix was rank deficient or " "ill-conditioned. Try setting fast=False " "or provide an l2_regularizer > 0.")); - *output = matrix.transpose() * llt.solve(rhs); + outputs->at(0) = matrix.transpose() * llt.solve(rhs); } } else { // Use complete orthogonal decomposition which is backwards stable and @@ -152,7 +147,7 @@ class MatrixSolveLsOp // the equivalent blocked LAPACK routine xGELSY (e.g. 
Eigen is ~3x // slower for 4k x 4k matrices). // See http://www.netlib.org/lapack/lawnspdf/lawn114.pdf - *output = matrix.completeOrthogonalDecomposition().solve(rhs); + outputs->at(0) = matrix.completeOrthogonalDecomposition().solve(rhs); } } @@ -160,13 +155,10 @@ class MatrixSolveLsOp bool fast_; }; -REGISTER_BINARY_LINALG_OP("MatrixSolveLs", (MatrixSolveLsOp), - float); -REGISTER_BINARY_LINALG_OP("MatrixSolveLs", (MatrixSolveLsOp), - double); -REGISTER_BINARY_LINALG_OP("BatchMatrixSolveLs", (MatrixSolveLsOp), - float); -REGISTER_BINARY_LINALG_OP("BatchMatrixSolveLs", (MatrixSolveLsOp), - double); +REGISTER_LINALG_OP("MatrixSolveLs", (MatrixSolveLsOp), float); +REGISTER_LINALG_OP("MatrixSolveLs", (MatrixSolveLsOp), double); +REGISTER_LINALG_OP("BatchMatrixSolveLs", (MatrixSolveLsOp), float); +REGISTER_LINALG_OP("BatchMatrixSolveLs", (MatrixSolveLsOp), + double); } // namespace tensorflow diff --git a/tensorflow/core/kernels/matrix_solve_op.cc b/tensorflow/core/kernels/matrix_solve_op.cc index 90ec43721ed..32f3bd32c1a 100644 --- a/tensorflow/core/kernels/matrix_solve_op.cc +++ b/tensorflow/core/kernels/matrix_solve_op.cc @@ -14,69 +14,59 @@ limitations under the License. ==============================================================================*/ // See docs in ../ops/linalg_ops.cc. -// TODO(rmlarsen): Add optional hint params so the caller can promise that the -// matrices are invertible, symmetric (maybe detect automatically?), and -// positive definite, which will allow us to call progressively faster solvers -// internally. 
-#include +#include "third_party/eigen3/Eigen/Core" #include "third_party/eigen3/Eigen/LU" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/kernels/binary_linalg_ops_common.h" +#include "tensorflow/core/kernels/linalg_ops_common.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { -template -class MatrixSolveOp - : public BinaryLinearAlgebraOp { +template +class MatrixSolveOp : public LinearAlgebraOp { public: - explicit MatrixSolveOp(OpKernelConstruction* context) - : BinaryLinearAlgebraOp(context) { + typedef LinearAlgebraOp Base; + + explicit MatrixSolveOp(OpKernelConstruction* context) : Base(context) { OP_REQUIRES_OK(context, context->GetAttr("adjoint", &adjoint_)); } - ~MatrixSolveOp() override {} - TensorShape GetOutputMatrixShape( - const TensorShape& input_matrix_shape, - const TensorShape& rhs_matrix_shape) override { - CHECK_EQ(input_matrix_shape.dims(), rhs_matrix_shape.dims()); - TensorShape output_matrix_shape = input_matrix_shape; - output_matrix_shape.set_dim( - output_matrix_shape.dims() - 1, - rhs_matrix_shape.dim_size(output_matrix_shape.dims() - 1)); - return output_matrix_shape; + using TensorShapes = typename Base::TensorShapes; + using Matrix = typename Base::Matrix; + using MatrixMaps = typename Base::MatrixMaps; + using ConstMatrixMap = typename Base::ConstMatrixMap; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; + + void ValidateInputMatrixShapes( + OpKernelContext* context, + const TensorShapes& input_matrix_shapes) const final { + Base::ValidateSquareSolver(context, input_matrix_shapes); } - int64 GetCostPerUnit(const TensorShape& input_matrix_shape, - const TensorShape& rhs_matrix_shape) override { - const int64 rows = 
input_matrix_shape.dim_size(0); - const int64 rhss = rhs_matrix_shape.dim_size(1); - if (rows > (1LL << 20)) { - // A big number to cap the cost in case overflow. - return kint32max; - } else { - return rows * rows * (rows + rhss); - } + TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const final { + return TensorShapes({TensorShape({input_matrix_shapes[0].dim_size(1), + input_matrix_shapes[1].dim_size(1)})}); } - using typename BinaryLinearAlgebraOp::Matrix; - using typename BinaryLinearAlgebraOp::MatrixMap; - using typename BinaryLinearAlgebraOp::ConstMatrixMap; + int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final { + double rows = static_cast(input_matrix_shapes[0].dim_size(0)); + double num_rhss = static_cast(input_matrix_shapes[1].dim_size(1)); + double cost = rows * rows * (rows + num_rhss); + return cost >= static_cast(kint64max) ? kint64max + : static_cast(cost); + } - void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& matrix, - const ConstMatrixMap& rhs, MatrixMap* output) override { - OP_REQUIRES(context, matrix.rows() == matrix.cols(), - errors::InvalidArgument("Input matrix must be square.")); - OP_REQUIRES( - context, matrix.cols() == rhs.rows(), - errors::InvalidArgument("Input matrix and rhs are incompatible.")); + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { + const ConstMatrixMap& matrix = inputs[0]; + const ConstMatrixMap& rhs = inputs[1]; if (matrix.rows() == 0 || rhs.cols() == 0) { // To be consistent with the MatrixInverse op, we define the solution for // an empty set of equation as the empty matrix. 
@@ -106,7 +96,7 @@ class MatrixSolveOp // The necessary changes to Eigen are in // https://bitbucket.org/eigen/eigen/pull-requests/174/ \ // add-matrix-condition-number-estimation/diff - *output = lu_decomposition.solve(rhs); + outputs->at(0) = lu_decomposition.solve(rhs); } private: @@ -115,12 +105,9 @@ class MatrixSolveOp TF_DISALLOW_COPY_AND_ASSIGN(MatrixSolveOp); }; -REGISTER_BINARY_LINALG_OP("MatrixSolve", (MatrixSolveOp), float); -REGISTER_BINARY_LINALG_OP("MatrixSolve", (MatrixSolveOp), - double); -REGISTER_BINARY_LINALG_OP("BatchMatrixSolve", (MatrixSolveOp), - float); -REGISTER_BINARY_LINALG_OP("BatchMatrixSolve", (MatrixSolveOp), - double); +REGISTER_LINALG_OP("MatrixSolve", (MatrixSolveOp), float); +REGISTER_LINALG_OP("MatrixSolve", (MatrixSolveOp), double); +REGISTER_LINALG_OP("BatchMatrixSolve", (MatrixSolveOp), float); +REGISTER_LINALG_OP("BatchMatrixSolve", (MatrixSolveOp), double); } // namespace tensorflow diff --git a/tensorflow/core/kernels/matrix_triangular_solve_op.cc b/tensorflow/core/kernels/matrix_triangular_solve_op.cc index 9d20369cbbc..50cab2b84e2 100644 --- a/tensorflow/core/kernels/matrix_triangular_solve_op.cc +++ b/tensorflow/core/kernels/matrix_triangular_solve_op.cc @@ -14,13 +14,12 @@ limitations under the License. ==============================================================================*/ // See docs in ../ops/linalg_ops.cc. -#include #include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/kernels/binary_linalg_ops_common.h" +#include "tensorflow/core/kernels/linalg_ops_common.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" @@ -28,54 +27,51 @@ limitations under the License. 
namespace tensorflow { -template +template class MatrixTriangularSolveOp - : public BinaryLinearAlgebraOp { + : public LinearAlgebraOp { public: + typedef LinearAlgebraOp Base; + explicit MatrixTriangularSolveOp(OpKernelConstruction* context) - : BinaryLinearAlgebraOp(context), - lower_(true), - adjoint_(false) { + : Base(context), lower_(true), adjoint_(false) { OP_REQUIRES_OK(context, context->GetAttr("lower", &lower_)); OP_REQUIRES_OK(context, context->GetAttr("adjoint", &adjoint_)); } - ~MatrixTriangularSolveOp() override {} - TensorShape GetOutputMatrixShape( - const TensorShape& input_matrix_shape, - const TensorShape& rhs_matrix_shape) override { - CHECK_EQ(input_matrix_shape.dims(), rhs_matrix_shape.dims()); - TensorShape output_matrix_shape = input_matrix_shape; - output_matrix_shape.set_dim( - output_matrix_shape.dims() - 1, - rhs_matrix_shape.dim_size(output_matrix_shape.dims() - 1)); - return output_matrix_shape; + using TensorShapes = typename Base::TensorShapes; + using Matrix = typename Base::Matrix; + using MatrixMap = typename Base::MatrixMap; + using MatrixMaps = typename Base::MatrixMaps; + using ConstMatrixMap = typename Base::ConstMatrixMap; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; + + virtual void ValidateInputMatrixShapes( + OpKernelContext* context, + const TensorShapes& input_matrix_shapes) const final { + Base::ValidateSquareSolver(context, input_matrix_shapes); } - int64 GetCostPerUnit(const TensorShape& input_matrix_shape, - const TensorShape& rhs_matrix_shape) override { - const int64 rows = input_matrix_shape.dim_size(0); - const int64 rhss = rhs_matrix_shape.dim_size(1); - if (rows > (1LL << 20)) { - // A big number to cap the cost in case overflow. 
- return kint32max; - } else { - return rows * rows * rhss; - } + TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const final { + return TensorShapes({TensorShape({input_matrix_shapes[0].dim_size(1), + input_matrix_shapes[1].dim_size(1)})}); } - using typename BinaryLinearAlgebraOp::MatrixMap; - using typename BinaryLinearAlgebraOp::ConstMatrixMap; + int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final { + double rows = static_cast(input_matrix_shapes[0].dim_size(0)); + double num_rhss = static_cast(input_matrix_shapes[1].dim_size(1)); + double cost = rows * rows * num_rhss; + return cost >= static_cast(kint64max) ? kint64max + : static_cast(cost); + } + + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { + const ConstMatrixMap& matrix = inputs[0]; + const ConstMatrixMap& rhs = inputs[1]; + MatrixMap& output = outputs->at(0); - void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& matrix, - const ConstMatrixMap& rhs, MatrixMap* output) override { - OP_REQUIRES(context, matrix.rows() == matrix.cols(), - errors::InvalidArgument("Input matrix must be square.")); - OP_REQUIRES( - context, matrix.cols() == rhs.rows(), - errors::InvalidArgument("Input matrix and rhs are incompatible.")); if (matrix.rows() == 0 || rhs.cols() == 0) { // To be consistent with the MatrixInverse op, we define the solution for // an empty set of equation as the empty matrix. 
@@ -87,16 +83,16 @@ class MatrixTriangularSolveOp if (lower_) { auto triangle = matrix.template triangularView(); if (adjoint_) { - output->noalias() = triangle.adjoint().solve(rhs); + output.noalias() = triangle.adjoint().solve(rhs); } else { - output->noalias() = triangle.solve(rhs); + output.noalias() = triangle.solve(rhs); } } else { auto triangle = matrix.template triangularView(); if (adjoint_) { - output->noalias() = triangle.adjoint().solve(rhs); + output.noalias() = triangle.adjoint().solve(rhs); } else { - output->noalias() = triangle.solve(rhs); + output.noalias() = triangle.solve(rhs); } } } @@ -108,13 +104,13 @@ class MatrixTriangularSolveOp TF_DISALLOW_COPY_AND_ASSIGN(MatrixTriangularSolveOp); }; -REGISTER_BINARY_LINALG_OP("MatrixTriangularSolve", - (MatrixTriangularSolveOp), float); -REGISTER_BINARY_LINALG_OP("MatrixTriangularSolve", - (MatrixTriangularSolveOp), double); -REGISTER_BINARY_LINALG_OP("BatchMatrixTriangularSolve", - (MatrixTriangularSolveOp), float); -REGISTER_BINARY_LINALG_OP("BatchMatrixTriangularSolve", - (MatrixTriangularSolveOp), double); +REGISTER_LINALG_OP("MatrixTriangularSolve", + (MatrixTriangularSolveOp), float); +REGISTER_LINALG_OP("MatrixTriangularSolve", + (MatrixTriangularSolveOp), double); +REGISTER_LINALG_OP("BatchMatrixTriangularSolve", + (MatrixTriangularSolveOp), float); +REGISTER_LINALG_OP("BatchMatrixTriangularSolve", + (MatrixTriangularSolveOp), double); } // namespace tensorflow diff --git a/tensorflow/core/kernels/one_hot_op.cc b/tensorflow/core/kernels/one_hot_op.cc index 916d85df2bc..1dc1bf65b22 100644 --- a/tensorflow/core/kernels/one_hot_op.cc +++ b/tensorflow/core/kernels/one_hot_op.cc @@ -85,26 +85,28 @@ class OneHotOp : public OpKernel { Tensor* output; OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output)); - // prefix_dim_size == # of elements before the axis - // depth_v == # of elements per axis - // suffix_dim_size == # of elements after the axis - int64 prefix_dim_size = 1; - for (int i 
= 0; i < axis; ++i) { - prefix_dim_size *= indices_shape.dim_size(i); + if (output_shape.num_elements() > 0) { + // prefix_dim_size == # of elements before the axis + // depth_v == # of elements per axis + // suffix_dim_size == # of elements after the axis + int64 prefix_dim_size = 1; + for (int i = 0; i < axis; ++i) { + prefix_dim_size *= indices_shape.dim_size(i); + } + TI suffix_dim_size = indices_shape.num_elements() / prefix_dim_size; + + // Split indices into matrix of size prefix_dim_size x suffix_dim_size + auto indices_t = + indices.shaped({prefix_dim_size, suffix_dim_size}); + // Split output into 3-Tensor of size: + // prefix_dim_size x depth x suffix_dim_size. + auto output_t = + output->shaped({prefix_dim_size, depth_v, suffix_dim_size}); + + functor::OneHot::Compute(ctx->eigen_device(), + indices_t, on_value_t, + off_value_t, &output_t); } - TI suffix_dim_size = - indices_shape.num_elements() / prefix_dim_size; - - // Split indices into matrix of size prefix_dim_size x suffix_dim_size - auto indices_t = - indices.shaped({prefix_dim_size, suffix_dim_size}); - // Split output into 3-Tensor of size: - // prefix_dim_size x depth x suffix_dim_size. 
- auto output_t = - output->shaped({prefix_dim_size, depth_v, suffix_dim_size}); - - functor::OneHot::Compute(ctx->eigen_device(), indices_t, - on_value_t, off_value_t, &output_t); } private: @@ -113,12 +115,12 @@ class OneHotOp : public OpKernel { TF_DISALLOW_COPY_AND_ASSIGN(OneHotOp); }; -#define REGISTER_ONE_HOT_INDEX(type, index_type) \ - REGISTER_KERNEL_BUILDER(Name("OneHot") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("TI") \ - .TypeConstraint("T") \ - .HostMemory("depth"), \ +#define REGISTER_ONE_HOT_INDEX(type, index_type) \ + REGISTER_KERNEL_BUILDER(Name("OneHot") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("TI") \ + .TypeConstraint("T") \ + .HostMemory("depth"), \ OneHotOp); #define REGISTER_ONE_HOT(type) \ @@ -132,13 +134,13 @@ TF_CALL_ALL_TYPES(REGISTER_ONE_HOT); // Forward declarations of the functor specializations for GPU. namespace functor { -#define DECLARE_GPU_SPEC_INDEX(T, TI) \ - template <> \ - void OneHot::Compute( \ - const GPUDevice& d, const typename TTypes::ConstMatrix& indices, \ - const typename TTypes::ConstScalar& on_value, \ - const typename TTypes::ConstScalar& off_value, \ - typename TTypes::Tensor* output); \ +#define DECLARE_GPU_SPEC_INDEX(T, TI) \ + template <> \ + void OneHot::Compute( \ + const GPUDevice& d, const typename TTypes::ConstMatrix& indices, \ + const typename TTypes::ConstScalar& on_value, \ + const typename TTypes::ConstScalar& off_value, \ + typename TTypes::Tensor* output); \ extern template struct OneHot; #define DECLARE_GPU_SPEC(T) \ @@ -154,12 +156,12 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); } // namespace functor // Registration of the GPU implementations. 
-#define REGISTER_ONE_HOT_GPU_INDEX(type, index_type) \ - REGISTER_KERNEL_BUILDER(Name("OneHot") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("TI") \ - .TypeConstraint("T") \ - .HostMemory("depth"), \ +#define REGISTER_ONE_HOT_GPU_INDEX(type, index_type) \ + REGISTER_KERNEL_BUILDER(Name("OneHot") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("TI") \ + .TypeConstraint("T") \ + .HostMemory("depth"), \ OneHotOp); #define REGISTER_ONE_HOT_GPU(type) \ diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h index 3d95338d11d..168e8ec1eda 100644 --- a/tensorflow/core/kernels/pooling_ops_common.h +++ b/tensorflow/core/kernels/pooling_ops_common.h @@ -111,6 +111,17 @@ class MaxPoolingOp : public OpKernel { 0, params.forward_output_shape(), &output)); if (params.depth_window > 1) { + // Validate spec against the current implementation. A + // relaxation of these requirements would be ideal. + OP_REQUIRES(context, params.depth % params.depth_window == 0, + errors::Unimplemented( + "Depthwise max pooling requires " + "the depth window to evenly divide the input depth.")); + OP_REQUIRES( + context, params.depth_window == params.depth_stride, + errors::Unimplemented("Depthwise max pooling requires " + "the depth window to equal the depth stride.")); + DepthwiseMaxPool(context, output, tensor_in, params); } else { SpatialMaxPool(context, output, tensor_in, params, padding_); diff --git a/tensorflow/core/kernels/reader_ops.cc b/tensorflow/core/kernels/reader_ops.cc index 1c7fbae81cd..bb8e35cc089 100644 --- a/tensorflow/core/kernels/reader_ops.cc +++ b/tensorflow/core/kernels/reader_ops.cc @@ -55,8 +55,9 @@ class ReaderVerbAsyncOpKernel : public AsyncOpKernel { void ComputeAsync(OpKernelContext* context, DoneCallback done) override { ReaderInterface* reader; - OP_REQUIRES_OK(context, - GetResourceFromContext(context, "reader_handle", &reader)); + OP_REQUIRES_OK_ASYNC( + context, GetResourceFromContext(context, "reader_handle", &reader), 
+ done); thread_pool_->Schedule([this, context, reader, done]() { ComputeWithReader(context, reader); reader->Unref(); diff --git a/tensorflow/core/kernels/self_adjoint_eig_op.cc b/tensorflow/core/kernels/self_adjoint_eig_op.cc index ceaeb640f20..9d3a411f3b2 100644 --- a/tensorflow/core/kernels/self_adjoint_eig_op.cc +++ b/tensorflow/core/kernels/self_adjoint_eig_op.cc @@ -15,11 +15,8 @@ limitations under the License. // See docs in ../ops/linalg_ops.cc. -#include - #include "third_party/eigen3/Eigen/Core" #include "third_party/eigen3/Eigen/Eigenvalues" - #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -28,41 +25,33 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" + namespace tensorflow { -template +template class SelfAdjointEigOp - : public UnaryLinearAlgebraOp { + : public LinearAlgebraOp { public: - explicit SelfAdjointEigOp(OpKernelConstruction* context) - : UnaryLinearAlgebraOp(context) {} + typedef LinearAlgebraOp Base; - TensorShape GetOutputMatrixShape( - const TensorShape& input_matrix_shape) override { - int64 d = input_matrix_shape.dim_size(0); - return TensorShape({d + 1, d}); + explicit SelfAdjointEigOp(OpKernelConstruction* context) : Base(context) {} + + using TensorShapes = typename Base::TensorShapes; + using Matrix = typename Base::Matrix; + using MatrixMaps = typename Base::MatrixMaps; + using ConstMatrixMap = typename Base::ConstMatrixMap; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; + + TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const final { + int64 d = input_matrix_shapes[0].dim_size(0); + return TensorShapes({TensorShape({d + 1, d})}); } - int64 GetCostPerUnit(const TensorShape& input_matrix_shape) override { - const int64 rows = input_matrix_shape.dim_size(0); - if (rows > (1LL << 20)) { - // A big number 
to cap the cost in case overflow. - return kint64max; - } else { - return rows * rows * rows; - } - } - - using - typename UnaryLinearAlgebraOp::MatrixMap; - using typename UnaryLinearAlgebraOp::ConstMatrixMap; - - void ComputeMatrix(OpKernelContext* context, const ConstMatrixMap& input, - MatrixMap* output) override { - OP_REQUIRES(context, input.rows() == input.cols(), - errors::InvalidArgument("Input matrix must be square.")); - if (input.rows() == 0) { + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { + const int64 rows = inputs[0].rows(); + if (rows == 0) { // If X is an empty matrix (0 rows, 0 col), X * X' == X. // Therefore, we return X. return; @@ -70,13 +59,13 @@ class SelfAdjointEigOp Eigen::SelfAdjointEigenSolver< Eigen::Matrix> - es(input); - output->row(0) = es.eigenvalues().transpose(); - output->bottomRows(input.rows()) = es.eigenvectors(); + es(inputs[0]); OP_REQUIRES(context, es.info() == Eigen::Success, errors::InvalidArgument("Self Adjoint Eigen decomposition was" "not successful. " "The input might not be valid.")); + outputs->at(0).row(0) = es.eigenvalues().transpose(); + outputs->at(0).bottomRows(rows) = es.eigenvectors(); } }; diff --git a/tensorflow/core/kernels/self_adjoint_eig_v2_op.cc b/tensorflow/core/kernels/self_adjoint_eig_v2_op.cc new file mode 100644 index 00000000000..1b457ebe9ef --- /dev/null +++ b/tensorflow/core/kernels/self_adjoint_eig_v2_op.cc @@ -0,0 +1,91 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/linalg_ops.cc. + +#include "third_party/eigen3/Eigen/Core" +#include "third_party/eigen3/Eigen/Eigenvalues" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/linalg_ops_common.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +template +class SelfAdjointEigV2Op + : public LinearAlgebraOp { + public: + typedef LinearAlgebraOp Base; + + explicit SelfAdjointEigV2Op(OpKernelConstruction* context) : Base(context) { + OP_REQUIRES_OK(context, context->GetAttr("compute_v", &compute_v_)); + } + + using TensorShapes = typename Base::TensorShapes; + using Matrix = typename Base::Matrix; + using MatrixMaps = typename Base::MatrixMaps; + using ConstMatrixMap = typename Base::ConstMatrixMap; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; + + TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const final { + int64 n = input_matrix_shapes[0].dim_size(0); + if (compute_v_) { + return TensorShapes({TensorShape({n}), TensorShape({n, n})}); + } else { + return TensorShapes({TensorShape({n})}); + } + } + + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { + const int64 rows = inputs[0].rows(); + if (rows == 0) { + // If X is an empty matrix (0 rows, 0 col), X * X' == X. + // Therefore, we return X. + return; + } + + Eigen::SelfAdjointEigenSolver eig( + inputs[0], + compute_v_ ? 
Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly); + OP_REQUIRES( + context, eig.info() == Eigen::Success, + errors::InvalidArgument("Self Adjoint Eigen decomposition was not " + "successful. The input might not be valid.")); + + outputs->at(0) = eig.eigenvalues(); + if (compute_v_) { + outputs->at(1) = eig.eigenvectors(); + } + } + + private: + bool compute_v_; +}; + +REGISTER_LINALG_OP("SelfAdjointEigV2", (SelfAdjointEigV2Op), + float); +REGISTER_LINALG_OP("SelfAdjointEigV2", (SelfAdjointEigV2Op), + double); +REGISTER_LINALG_OP("BatchSelfAdjointEigV2", (SelfAdjointEigV2Op), + float); +REGISTER_LINALG_OP("BatchSelfAdjointEigV2", (SelfAdjointEigV2Op), + double); +} // namespace tensorflow diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc index 0acde9c498b..3cbd9691d18 100644 --- a/tensorflow/core/kernels/sequence_ops.cc +++ b/tensorflow/core/kernels/sequence_ops.cc @@ -118,16 +118,23 @@ class LinSpaceOp : public OpKernel { } }; -#define REGISTER_CPU_KERNEL(T) \ +#define REGISTER_KERNEL(DEV, T) \ REGISTER_KERNEL_BUILDER(Name("LinSpace") \ - .Device(DEVICE_CPU) \ + .Device(DEV) \ .TypeConstraint("T") \ .HostMemory("start") \ .HostMemory("stop") \ .HostMemory("num") \ .HostMemory("output"), \ LinSpaceOp); +#define REGISTER_CPU_KERNEL(T) REGISTER_KERNEL(DEVICE_CPU, T) TF_CALL_float(REGISTER_CPU_KERNEL); TF_CALL_double(REGISTER_CPU_KERNEL); +// NOTE(touts): We register the op on GPU but it still runs on CPU +// because its inputs and outputs are tagged as HostMemory. 
+#define REGISTER_GPU_KERNEL(T) REGISTER_KERNEL(DEVICE_GPU, T) +TF_CALL_float(REGISTER_GPU_KERNEL); +TF_CALL_double(REGISTER_GPU_KERNEL); + } // namespace tensorflow diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc index 0861fa99821..63ad0059d45 100644 --- a/tensorflow/core/kernels/shape_ops.cc +++ b/tensorflow/core/kernels/shape_ops.cc @@ -253,6 +253,8 @@ class ExpandDimsOp : public OpKernel { " and output shape ", output_shape.DebugString())); } } + + bool IsExpensive() override { return false; } }; REGISTER_KERNEL_BUILDER(Name("ExpandDims").Device(DEVICE_CPU).HostMemory("dim"), ExpandDimsOp); @@ -342,6 +344,8 @@ class SqueezeOp : public OpKernel { } } + bool IsExpensive() override { return false; } + private: std::unordered_set squeeze_dims_; }; diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc index 2092c0486f5..e5c7308c565 100644 --- a/tensorflow/core/kernels/strided_slice_op.cc +++ b/tensorflow/core/kernels/strided_slice_op.cc @@ -405,14 +405,30 @@ class StridedSliceGradOp : public OpKernel { context, input_shape_tensor.dims() == 1, errors::InvalidArgument("shape must be 1-D, got shape.shape = ", input_shape_tensor.shape().DebugString())); - OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape( - input_shape_tensor.vec(), &input_shape)); + if (input_shape_tensor.dtype() == DT_INT32) { + OP_REQUIRES_OK( + context, TensorShapeUtils::MakeShape(input_shape_tensor.vec(), + &input_shape)); + } else if (input_shape_tensor.dtype() == DT_INT64) { + OP_REQUIRES_OK( + context, TensorShapeUtils::MakeShape(input_shape_tensor.vec(), + &input_shape)); + } else { + LOG(FATAL) << "shape must have type int32 or int64."; + } SharedValidation(context, input_shape, begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask, &processing_shape, &final_shape, &is_identity, &is_simple_slice, &slice_dim0, &begin, &end, &strides); + // Check to make sure dy is consistent with the 
original slice + TensorShape dy_shape = context->input(4).shape(); + OP_REQUIRES( + context, final_shape == dy_shape, + errors::InvalidArgument("shape of dy was ", dy_shape.DebugString(), + " instead of ", final_shape.DebugString())); + if (!context->status().ok()) return; // const int input_dims = input.dims(); @@ -454,6 +470,7 @@ class StridedSliceGradOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ + .HostMemory("shape") \ .HostMemory("begin") \ .HostMemory("end") \ .HostMemory("strides"), \ @@ -478,6 +495,7 @@ REGISTER_STRIDED_SLICE(bfloat16); REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad") \ .Device(DEVICE_GPU) \ .TypeConstraint("T") \ + .HostMemory("shape") \ .HostMemory("begin") \ .HostMemory("end") \ .HostMemory("strides") \ diff --git a/tensorflow/core/kernels/svd_op.cc b/tensorflow/core/kernels/svd_op.cc new file mode 100644 index 00000000000..c3686947dda --- /dev/null +++ b/tensorflow/core/kernels/svd_op.cc @@ -0,0 +1,105 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/linalg_ops.cc. 
+#include + +#include "third_party/eigen3/Eigen/SVD" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/kernels/linalg_ops_common.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +template +class SvdOp : public LinearAlgebraOp { + public: + typedef LinearAlgebraOp Base; + + explicit SvdOp(OpKernelConstruction* context) : Base(context) { + OP_REQUIRES_OK(context, context->GetAttr("compute_uv", &compute_uv_)); + OP_REQUIRES_OK(context, context->GetAttr("full_matrices", &full_matrices_)); + } + + using TensorShapes = typename Base::TensorShapes; + + void ValidateInputMatrixShapes( + OpKernelContext* context, + const TensorShapes& input_matrix_shapes) const final { + Base::ValidateSingleMatrix(context, input_matrix_shapes); + } + + TensorShapes GetOutputMatrixShapes( + const TensorShapes& input_matrix_shapes) const final { + int64 m = input_matrix_shapes[0].dim_size(0); + int64 n = input_matrix_shapes[0].dim_size(1); + int64 min_size = std::min(m, n); + if (compute_uv_) { + return TensorShapes({TensorShape({min_size}), + TensorShape({m, full_matrices_ ? m : min_size}), + TensorShape({n, full_matrices_ ? n : min_size})}); + } else { + return TensorShapes({TensorShape({min_size})}); + } + } + + // TODO(rmlarsen): This should depend on compute_uv. See b/30409375. + int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final { + double m = static_cast(input_matrix_shapes[0].dim_size(0)); + double n = static_cast(input_matrix_shapes[0].dim_size(1)); + double cost = 12 * std::max(m, n) * std::min(m, n) * std::min(m, n); + return cost >= static_cast(kint64max) ? 
kint64max + : static_cast(cost); + } + + using Matrix = typename Base::Matrix; + using MatrixMaps = typename Base::MatrixMaps; + using ConstMatrixMap = typename Base::ConstMatrixMap; + using ConstMatrixMaps = typename Base::ConstMatrixMaps; + + void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, + MatrixMaps* outputs) final { + Eigen::JacobiSVD svd; + if (compute_uv_) { + svd.compute(inputs[0], + (full_matrices_ ? Eigen::ComputeFullU | Eigen::ComputeFullV + : Eigen::ComputeThinU | Eigen::ComputeThinV)); + outputs->at(0) = svd.singularValues(); + outputs->at(1) = svd.matrixU(); + outputs->at(2) = svd.matrixV(); + } else { + svd.compute(inputs[0]); + outputs->at(0) = svd.singularValues(); + } + } + + private: + bool compute_uv_; + bool full_matrices_; + + TF_DISALLOW_COPY_AND_ASSIGN(SvdOp); +}; + +REGISTER_LINALG_OP("Svd", (SvdOp), float); +REGISTER_LINALG_OP("Svd", (SvdOp), double); +REGISTER_LINALG_OP("BatchSvd", (SvdOp), float); +REGISTER_LINALG_OP("BatchSvd", (SvdOp), double); + +} // namespace tensorflow diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc index 59225049fa7..c3704da0b12 100644 --- a/tensorflow/core/lib/core/threadpool.cc +++ b/tensorflow/core/lib/core/threadpool.cc @@ -29,11 +29,14 @@ namespace thread { struct EigenEnvironment { typedef Thread EnvThread; - struct Task { + struct TaskImpl { std::function f; Context context; uint64 trace_id; }; + struct Task { + std::unique_ptr f; + }; Env* const env_; const ThreadOptions thread_options_; @@ -58,17 +61,21 @@ struct EigenEnvironment { port::Tracing::RecordEvent(port::Tracing::EventCategory::kScheduleClosure, id); } - return Task{std::move(f), Context(), id}; + return Task{ + std::unique_ptr(new TaskImpl{ + std::move(f), Context(ContextKind::kThread), id, + }), + }; } void ExecuteTask(const Task& t) { - WithContext wc(t.context); - if (t.trace_id != 0) { + WithContext wc(t.f->context); + if (t.f->trace_id != 0) { 
port::Tracing::ScopedActivity region( - port::Tracing::EventCategory::kRunClosure, t.trace_id); - t.f(); + port::Tracing::EventCategory::kRunClosure, t.f->trace_id); + t.f->f(); } else { - t.f(); + t.f->f(); } } }; diff --git a/tensorflow/core/lib/monitoring/counter.h b/tensorflow/core/lib/monitoring/counter.h index af76884012d..0fcbe90ea89 100644 --- a/tensorflow/core/lib/monitoring/counter.h +++ b/tensorflow/core/lib/monitoring/counter.h @@ -20,6 +20,9 @@ limitations under the License. #include #include +#include "tensorflow/core/lib/monitoring/export_registry.h" +#include "tensorflow/core/lib/monitoring/metric_def.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -71,8 +74,14 @@ class CounterCell { template class Counter { public: - Counter() {} - ~Counter() {} + ~Counter() { + // Deleted here, before the metric_def is destroyed. + registration_handle_.reset(); + } + + // Creates the metric based on the metric-definition. + static Counter* New( + const MetricDef& metric_def); // Retrieves the cell for the specified labels, creating it on demand if // not already present. @@ -80,8 +89,20 @@ class Counter { CounterCell* GetCell(const Labels&... labels) LOCKS_EXCLUDED(mu_); private: + explicit Counter( + const MetricDef& metric_def) + : metric_def_(metric_def), + registration_handle_( + ExportRegistry::Default()->Register(&metric_def_)) {} + mutable mutex mu_; + // The metric definition. This will be used to identify the metric when we + // register it for exporting. + const MetricDef metric_def_; + + std::unique_ptr registration_handle_; + using LabelArray = std::array; std::map cells_ GUARDED_BY(mu_); @@ -92,6 +113,19 @@ class Counter { // Implementation details follow. API readers may skip. 
//// +template +Counter* Counter::New( + const MetricDef& metric_def) { + return new Counter(metric_def); +} + +inline void CounterCell::IncrementBy(const int64 step) { + DCHECK_LE(0, step) << "Must not decrement cumulative metrics."; + value_ += step; +} + +inline int64 CounterCell::value() const { return value_; } + template template CounterCell* Counter::GetCell(const Labels&... labels) diff --git a/tensorflow/core/lib/monitoring/counter_test.cc b/tensorflow/core/lib/monitoring/counter_test.cc index 0010662e263..2bf361a534a 100644 --- a/tensorflow/core/lib/monitoring/counter_test.cc +++ b/tensorflow/core/lib/monitoring/counter_test.cc @@ -19,26 +19,24 @@ limitations under the License. namespace tensorflow { namespace monitoring { +namespace { -class LabeledCounterTest : public ::testing::Test { - protected: - LabeledCounterTest() {} +auto* counter_with_labels = + Counter<1>::New({"/tensorflow/test/counter_with_labels", + "Counter with one label.", "One label"}); - Counter<1> counter_; -}; - -TEST_F(LabeledCounterTest, InitializedWithZero) { - EXPECT_EQ(0, counter_.GetCell("Empty")->value()); +TEST(LabeledCounterTest, InitializedWithZero) { + EXPECT_EQ(0, counter_with_labels->GetCell("Empty")->value()); } -TEST_F(LabeledCounterTest, GetCell) { - auto* cell = counter_.GetCell("GetCellOp"); +TEST(LabeledCounterTest, GetCell) { + auto* cell = counter_with_labels->GetCell("GetCellOp"); EXPECT_EQ(0, cell->value()); cell->IncrementBy(42); EXPECT_EQ(42, cell->value()); - auto* same_cell = counter_.GetCell("GetCellOp"); + auto* same_cell = counter_with_labels->GetCell("GetCellOp"); EXPECT_EQ(42, same_cell->value()); same_cell->IncrementBy(58); @@ -46,32 +44,31 @@ TEST_F(LabeledCounterTest, GetCell) { EXPECT_EQ(100, same_cell->value()); } -using LabeledCounterDeathTest = LabeledCounterTest; - -TEST_F(LabeledCounterDeathTest, DiesOnDecrement) { - EXPECT_DEBUG_DEATH({ counter_.GetCell("DyingOp")->IncrementBy(-1); }, - "decrement"); +TEST(LabeledCounterDeathTest, 
DiesOnDecrement) { + EXPECT_DEBUG_DEATH( + { counter_with_labels->GetCell("DyingOp")->IncrementBy(-1); }, + "decrement"); } -class UnlabeledCounterTest : public ::testing::Test { - protected: - UnlabeledCounterTest() {} +auto* init_counter_without_labels = Counter<0>::New( + {"/tensorflow/test/init_counter_without_labels", + "Counter without any labels to check if it is initialized as 0."}); - Counter<0> counter_; -}; - -TEST_F(UnlabeledCounterTest, InitializedWithZero) { - EXPECT_EQ(0, counter_.GetCell()->value()); +TEST(UnlabeledCounterTest, InitializedWithZero) { + EXPECT_EQ(0, init_counter_without_labels->GetCell()->value()); } -TEST_F(UnlabeledCounterTest, GetCell) { - auto* cell = counter_.GetCell(); +auto* counter_without_labels = Counter<0>::New( + {"/tensorflow/test/counter_without_labels", "Counter without any labels."}); + +TEST(UnlabeledCounterTest, GetCell) { + auto* cell = counter_without_labels->GetCell(); EXPECT_EQ(0, cell->value()); cell->IncrementBy(42); EXPECT_EQ(42, cell->value()); - auto* same_cell = counter_.GetCell(); + auto* same_cell = counter_without_labels->GetCell(); EXPECT_EQ(42, same_cell->value()); same_cell->IncrementBy(58); @@ -79,11 +76,16 @@ TEST_F(UnlabeledCounterTest, GetCell) { EXPECT_EQ(100, same_cell->value()); } -using UnlabeledCounterDeathTest = UnlabeledCounterTest; +auto* dead_counter_without_labels = Counter<0>::New( + {"/tensorflow/test/dead_counter_without_labels", + "Counter without any labels which goes on to die on decrement."}); -TEST_F(UnlabeledCounterDeathTest, DiesOnDecrement) { - EXPECT_DEBUG_DEATH({ counter_.GetCell()->IncrementBy(-1); }, "decrement"); +TEST(UnlabeledCounterDeathTest, DiesOnDecrement) { + EXPECT_DEBUG_DEATH( + { dead_counter_without_labels->GetCell()->IncrementBy(-1); }, + "decrement"); } +} // namespace } // namespace monitoring } // namespace tensorflow diff --git a/tensorflow/core/lib/monitoring/export_registry.cc b/tensorflow/core/lib/monitoring/export_registry.cc new file mode 100644 
index 00000000000..4c0eed668a4 --- /dev/null +++ b/tensorflow/core/lib/monitoring/export_registry.cc @@ -0,0 +1,53 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/lib/monitoring/export_registry.h" + +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +namespace monitoring { + +ExportRegistry* ExportRegistry::Default() { + static ExportRegistry* default_registry = new ExportRegistry(); + return default_registry; +} + +std::unique_ptr ExportRegistry::Register( + const AbstractMetricDef* const metric_def) { + mutex_lock l(mu_); + + LOG(INFO) << "Here." << registry_.size(); + const auto found_it = registry_.find(metric_def->name()); + if (found_it != registry_.end()) { + LOG(INFO) << "Here2"; + LOG(FATAL) << "Cannot register 2 metrics with the same name: " + << metric_def->name(); + } + LOG(INFO) << "Here3"; + registry_.insert({metric_def->name(), metric_def}); + LOG(INFO) << "Here4." 
<< registry_.size(); + + return std::unique_ptr( + new RegistrationHandle(this, metric_def)); +} + +void ExportRegistry::Unregister(const AbstractMetricDef* const metric_def) { + mutex_lock l(mu_); + registry_.erase(metric_def->name()); +} + +} // namespace monitoring +} // namespace tensorflow diff --git a/tensorflow/core/lib/monitoring/export_registry.h b/tensorflow/core/lib/monitoring/export_registry.h new file mode 100644 index 00000000000..aca47735718 --- /dev/null +++ b/tensorflow/core/lib/monitoring/export_registry.h @@ -0,0 +1,88 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_EXPORT_REGISTRY_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_EXPORT_REGISTRY_H_ + +#include +#include + +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/monitoring/metric_def.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { +namespace monitoring { + +// An export registry for metrics. +// +// Metrics are registered here so that their state can be exported later using +// an exporter. +// +// This class is thread-safe. +class ExportRegistry { + public: + ~ExportRegistry() = default; + + // Returns the default registry for the process. 
+ // + // This registry belongs to this library and should never be deleted. + static ExportRegistry* Default(); + + // Registers the metric and returns a Registration object. The destruction of + // the registration object would cause the metric to be unregistered from this + // registry. + // + // IMPORTANT: Delete the handle before the metric-def is deleted. + class RegistrationHandle; + std::unique_ptr Register( + const AbstractMetricDef* metric_def) + LOCKS_EXCLUDED(mu_) TF_MUST_USE_RESULT; + + private: + ExportRegistry() = default; + + // Unregisters the metric from this registry. This is private because the + // public interface provides a Registration handle which automatically calls + // this upon destruction. + void Unregister(const AbstractMetricDef* metric_def) LOCKS_EXCLUDED(mu_); + + mutable mutex mu_; + std::map registry_ GUARDED_BY(mu_); +}; + +//// +// Implementation details follow. API readers may skip. +//// + +class ExportRegistry::RegistrationHandle { + public: + RegistrationHandle(ExportRegistry* const export_registry, + const AbstractMetricDef* const metric_def) + : export_registry_(export_registry), metric_def_(metric_def) {} + + ~RegistrationHandle() { export_registry_->Unregister(metric_def_); } + + private: + ExportRegistry* const export_registry_; + const AbstractMetricDef* const metric_def_; +}; + +} // namespace monitoring +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_EXPORT_REGISTRY_H_ diff --git a/tensorflow/core/lib/monitoring/export_registry_test.cc b/tensorflow/core/lib/monitoring/export_registry_test.cc new file mode 100644 index 00000000000..a7cb0e8e52e --- /dev/null +++ b/tensorflow/core/lib/monitoring/export_registry_test.cc @@ -0,0 +1,60 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/lib/monitoring/export_registry.h" + +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace monitoring { +namespace { + +TEST(ExportRegistryTest, RegistrationUnregistration) { + auto* export_registry = ExportRegistry::Default(); + const MetricDef metric_def0( + "/tensorflow/metric0", "An example metric with no labels."); + const MetricDef metric_def1( + "/tensorflow/metric1", "An example metric with one label.", "LabelName"); + + { + // Enclosed in a scope so that we unregister before the stack variables + // above are destroyed. + + std::unique_ptr handle0 = + export_registry->Register(&metric_def0); + std::unique_ptr handle1 = + export_registry->Register(&metric_def1); + + handle0.reset(); + + // Able to register again because it was unregistered earlier. 
+ handle0 = export_registry->Register(&metric_def0); + } +} + +TEST(ExportRegistryDeathTest, DuplicateRegistration) { + auto* export_registry = ExportRegistry::Default(); + const MetricDef metric_def( + "/tensorflow/metric", "An example metric with no labels."); + + auto handle = export_registry->Register(&metric_def); + EXPECT_DEATH( + { auto duplicate_handle = export_registry->Register(&metric_def); }, + "/tensorflow/metric"); +} + +} // namespace +} // namespace monitoring +} // namespace tensorflow diff --git a/tensorflow/core/lib/monitoring/metric_def.h b/tensorflow/core/lib/monitoring/metric_def.h new file mode 100644 index 00000000000..01210e370ad --- /dev/null +++ b/tensorflow/core/lib/monitoring/metric_def.h @@ -0,0 +1,128 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_ + +#include +#include + +#include "tensorflow/core/lib/core/stringpiece.h" + +namespace tensorflow { +namespace monitoring { + +// Everything in the internal namespace is implementation details. Do not depend +// on this. +namespace internal { + +// Ensures that the string is a compile-time string literal. +class StringLiteral { + public: + // We allow implicit conversions here on purpose. 
+ template + StringLiteral(const char (&data)[N]) : literal_(data, N - 1) {} + + // This ctor will be called for non-literals, causing compile-time failure. + template + StringLiteral(const NotStringLiteral& not_string_literal) = delete; + + // Implicit conversion to StringPiece. + operator StringPiece() const { return literal_; } + + private: + const StringPiece literal_; +}; + +} // namespace internal + +// The different metric kinds available. +// +// Gauge indicates that the metric's values are instantaneous measurements of a +// (typically) continuously varying quantity. Examples: a process's current heap +// size, a queue's current length. +// +// Cumulative indicates that the metric's values represent non-negative changes +// over specified time periods. Example: the number of rpc calls to a service. +enum MetricKind { GAUGE, CUMULATIVE }; + +// Abstract base class for a metric definition. +// +// Unlike MetricDef, this class is non-templatized and allows storing and +// accessing metric definitions without the full type information. +// +// Everything except the value type of a metric is stored here. Please read +// MetricDef class comments for more details. +class AbstractMetricDef { + public: + MetricKind kind() const { return kind_; } + + StringPiece name() const { return name_; } + + StringPiece description() const { return description_; } + + const std::vector label_descriptions() const { + return label_descriptions_; + } + + private: + template + friend class MetricDef; + + AbstractMetricDef( + const MetricKind kind, const internal::StringLiteral name, + const internal::StringLiteral description, + const std::vector& label_descriptions) + : kind_(kind), + name_(name), + description_(description), + label_descriptions_( + {label_descriptions.begin(), label_descriptions.end()}) {} + + const MetricKind kind_; + const StringPiece name_; + const StringPiece description_; + const std::vector label_descriptions_; +}; + +// Metric definition. 
+// +// A metric is defined by its kind, value-type, name, description and the +// description of its labels. +// +// NOTE: We allow only string literals for the name, description and label +// descriptions because these should be fixed at compile-time and shouldn't be +// dynamic. +template +class MetricDef : public AbstractMetricDef { + public: + using value_type = Value; + + template + MetricDef(const internal::StringLiteral name, + const internal::StringLiteral description, + const LabelDesc&... label_descriptions) + : AbstractMetricDef(metric_kind, name, description, + {label_descriptions...}) { + static_assert(sizeof...(LabelDesc) == NumLabels, + "Mismatch between Counter and number of label " + "descriptions."); + } +}; + +} // namespace monitoring +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_ diff --git a/tensorflow/core/lib/monitoring/metric_def_test.cc b/tensorflow/core/lib/monitoring/metric_def_test.cc new file mode 100644 index 00000000000..5d371cca1b1 --- /dev/null +++ b/tensorflow/core/lib/monitoring/metric_def_test.cc @@ -0,0 +1,46 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/lib/monitoring/metric_def.h" + +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace monitoring { +namespace { + +TEST(MetricDefTest, Simple) { + const MetricDef metric_def0( + "/tensorflow/metric0", "An example metric with no labels."); + const MetricDef metric_def1( + "/tensorflow/metric1", "An example metric with one label.", "LabelName"); + + EXPECT_EQ("/tensorflow/metric0", metric_def0.name()); + EXPECT_EQ("/tensorflow/metric1", metric_def1.name()); + + EXPECT_EQ(MetricKind::CUMULATIVE, metric_def0.kind()); + EXPECT_EQ(MetricKind::GAUGE, metric_def1.kind()); + + EXPECT_EQ("An example metric with no labels.", metric_def0.description()); + EXPECT_EQ("An example metric with one label.", metric_def1.description()); + + EXPECT_EQ(0, metric_def0.label_descriptions().size()); + ASSERT_EQ(1, metric_def1.label_descriptions().size()); + EXPECT_EQ("LabelName", metric_def1.label_descriptions()[0]); +} + +} // namespace +} // namespace monitoring +} // namespace tensorflow diff --git a/tensorflow/core/ops/array_grad.cc b/tensorflow/core/ops/array_grad.cc index d3ffb907bcb..fe2be71136a 100644 --- a/tensorflow/core/ops/array_grad.cc +++ b/tensorflow/core/ops/array_grad.cc @@ -406,4 +406,46 @@ Status StridedSliceGrad(const AttrSlice& attrs, FunctionDef* g) { } REGISTER_OP_GRADIENT("StridedSlice", StridedSliceGrad); +Status StridedSliceGradGrad(const AttrSlice& attrs, FunctionDef* g) { + DataType itype; + TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "Index", &itype)); + if (itype != DT_INT32) { + return errors::Unimplemented( + "SliceGrad for int64 index are not supported."); + } + + // TODO(aselle): Shouldn't the int32 tensors return zeros of shape like + // dy_grad? + // I'm following slice's behavior for now. 
+ *g = FDH::Define( + // Arg defs + {"shape: int32", "begin: int32", "end: int32", "stride: int32", "dy: T", + "grad: T"}, + // Ret val defs + {"shape_grad: int32", "begin_grad: int32", "end_grad: int32", + "stride_grad: int32", "dy_grad: T"}, + // Attr defs + {"T: type", "Index: {int32, int64}", "begin_mask: int", "end_mask: int", + "ellipsis_mask: int", "new_axis_mask: int", "shrink_axis_mask: int"}, + {// Nodes + {{{"shape_grad"}, "ZerosLike", {"shape"}, {{"T", DT_INT32}}}, + {{"begin_grad"}, "ZerosLike", {"begin"}, {{"T", DT_INT32}}}, + {{"end_grad"}, "ZerosLike", {"end"}, {{"T", DT_INT32}}}, + {{"stride_grad"}, "ZerosLike", {"stride"}, {{"T", DT_INT32}}}, + {{"dy_grad"}, + "StridedSlice", + {"grad", "begin", "end", "stride"}, + {{"T", "$T"}, + {"Index", "$Index"}, + {"begin_mask", "$begin_mask"}, + {"end_mask", "$end_mask"}, + {"ellipsis_mask", "$ellipsis_mask"}, + {"new_axis_mask", "$new_axis_mask"}, + {"shrink_axis_mask", "$shrink_axis_mask"}}}}}); + + VLOG(1) << "StridedSliceGrad " << DebugString(*g); + return Status::OK(); +} +REGISTER_OP_GRADIENT("StridedSliceGrad", StridedSliceGradGrad); + } // end namespace tensorflow diff --git a/tensorflow/core/ops/array_grad_test.cc b/tensorflow/core/ops/array_grad_test.cc index a051e456e5c..73bfebc1032 100644 --- a/tensorflow/core/ops/array_grad_test.cc +++ b/tensorflow/core/ops/array_grad_test.cc @@ -478,9 +478,57 @@ std::vector StridedSliceGrad(const Tensor& x, const Tensor& begin, return out; } +std::vector StridedSliceGradGrad( + const Tensor& shape, const Tensor& begin, const Tensor& end, + const Tensor& strides, const Tensor& dy, const Tensor& grad, + int32 begin_mask, int32 end_mask, int32 ellipsis_mask, int32 new_axis_mask, + int32 shrink_axis_mask) { + auto T = DT_FLOAT; + auto gdef = test::function::GDef( + {f::NDef("shape", "Placeholder", {}, {{"dtype", DT_INT32}}), + f::NDef("begin", "Placeholder", {}, {{"dtype", DT_INT32}}), + f::NDef("end", "Placeholder", {}, {{"dtype", DT_INT32}}), + 
f::NDef("strides", "Placeholder", {}, {{"dtype", DT_INT32}}), + f::NDef("dy", "Placeholder", {}, {{"dtype", T}}), + f::NDef("grad", "Placeholder", {}, {{"dtype", T}}), + f::NDef( + "dx", "SymbolicGradient", + {"shape", "begin", "end", "strides", "dy", "grad"}, + {{"f", FDH::FunctionRef("StridedSliceGrad", + { + {"T", T}, + {"Index", DT_INT32}, + {"begin_mask", begin_mask}, + {"end_mask", end_mask}, + {"new_axis_mask", new_axis_mask}, + {"shrink_axis_mask", shrink_axis_mask}, + {"ellipsis_mask", ellipsis_mask}, + })}, + {"Tin", + DataTypeSlice{DT_INT32, DT_INT32, DT_INT32, DT_INT32, T, T}}, + {"Tout", + DataTypeSlice{DT_INT32, DT_INT32, DT_INT32, DT_INT32, T}}})}); + VLOG(1) << DebugStringWhole(gdef); + auto sess = NewSession(); + TF_CHECK_OK(sess->Create(gdef)); + std::vector out; + TF_CHECK_OK(sess->Run({{"shape:0", shape}, + {"begin:0", begin}, + {"end:0", end}, + {"strides:0", strides}, + {"dy:0", dy}, + {"grad:0", grad}}, + {"dx:0", "dx:1", "dx:2", "dx:3", "dx:4"}, {}, &out)); + CHECK_EQ(out.size(), 5); + TF_CHECK_OK(sess->Close()); + delete sess; + return out; +} + TEST_F(ArrayGradTest, StridedSliceGrad) { Tensor x(DT_FLOAT, {2, 3, 4}); x.flat().setZero(); + Tensor x_shape = test::AsTensor({2, 3, 4}, {3}); { auto start = test::AsTensor({1, 1, 1}); @@ -502,6 +550,10 @@ TEST_F(ArrayGradTest, StridedSliceGrad) { {2, 3, 4})); test::ExpectTensorEqual(dx[1], test::AsTensor({0, 0, 0})); test::ExpectTensorEqual(dx[2], test::AsTensor({0, 0, 0})); + auto ddx = StridedSliceGradGrad(x_shape, start, stop, strides, dy, dx[0], + begin_mask, end_mask, ellipsis_mask, + new_axis_mask, shrink_axis_mask); + test::ExpectClose(ddx[4], dy); } // test equivalent of python tf.gradients(foo[1:2, 1:3, 1:3]) @@ -525,6 +577,10 @@ TEST_F(ArrayGradTest, StridedSliceGrad) { {2, 3, 4})); test::ExpectTensorEqual(dx[1], test::AsTensor({0, 0, 0})); test::ExpectTensorEqual(dx[2], test::AsTensor({0, 0, 0})); + auto ddx = StridedSliceGradGrad(x_shape, start, stop, strides, dy, dx[0], + begin_mask, 
end_mask, ellipsis_mask, + new_axis_mask, shrink_axis_mask); + test::ExpectClose(ddx[4], dy); } // test equivalent of python tf.gradients(foo[1, 1:, :-2, None]) @@ -549,6 +605,10 @@ TEST_F(ArrayGradTest, StridedSliceGrad) { {2, 3, 4})); test::ExpectTensorEqual(dx[1], test::AsTensor({0, 0, 0, 0})); test::ExpectTensorEqual(dx[2], test::AsTensor({0, 0, 0, 0})); + auto ddx = StridedSliceGradGrad(x_shape, start, stop, strides, dy, dx[0], + begin_mask, end_mask, ellipsis_mask, + new_axis_mask, shrink_axis_mask); + test::ExpectClose(ddx[4], dy); } // test equivalent of tf.gradients(foo[1, ...]) i.e. foo[1, 0:3, 0:4] @@ -573,6 +633,10 @@ TEST_F(ArrayGradTest, StridedSliceGrad) { {2, 3, 4})); test::ExpectTensorEqual(dx[1], test::AsTensor({0, 0})); test::ExpectTensorEqual(dx[2], test::AsTensor({0, 0})); + auto ddx = StridedSliceGradGrad(x_shape, start, stop, strides, dy, dx[0], + begin_mask, end_mask, ellipsis_mask, + new_axis_mask, shrink_axis_mask); + test::ExpectClose(ddx[4], dy); } } diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc index ec0bfa32848..a6968888678 100644 --- a/tensorflow/core/ops/array_ops.cc +++ b/tensorflow/core/ops/array_ops.cc @@ -344,6 +344,34 @@ REGISTER_OP("Split") .Output("output: num_split * T") .Attr("num_split: int >= 1") .Attr("T: type") + .SetShapeFn([](InferenceContext* c) { + const Dimension* split_dimension; + TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(0, &split_dimension)); + int num_split = c->num_outputs(); + const Shape* input = c->input(1); + const Shape* out; + if (!c->ValueKnown(split_dimension)) { + if (c->RankKnown(input)) { + std::vector dims; + dims.resize(c->Rank(input)); + for (int i = 0; i < dims.size(); ++i) dims[i] = c->UnknownDim(); + out = c->MakeShape(dims); + } else { + out = c->UnknownShape(); + } + } else { + int64 split_dim = c->Value(split_dimension); + TF_RETURN_IF_ERROR(c->WithRankAtLeast(input, split_dim + 1, &input)); + const Dimension* split_dim_size; + 
TF_RETURN_WITH_CONTEXT_IF_ERROR( + c->Divide(c->Dim(input, split_dim), num_split, &split_dim_size), + "Number of ways to split should evenly divide the split dimension"); + TF_RETURN_IF_ERROR( + c->ReplaceDim(input, split_dim, split_dim_size, &out)); + } + for (int i = 0; i < num_split; ++i) c->set_output(i, out); + return Status::OK(); + }) .Doc(R"doc( Splits a tensor into `num_split` tensors along one dimension. @@ -739,7 +767,9 @@ REGISTER_OP("Reverse") .Input("tensor: T") .Input("dims: bool") .Output("output: T") - .Attr("T: {uint8, int8, int32, bool, half, float, double, complex64, complex128}") + .Attr( + "T: {uint8, int8, int32, bool, half, float, double, complex64, " + "complex128}") .SetShapeFn([](InferenceContext* c) { const Shape* input = c->input(0); const Shape* dims; @@ -819,6 +849,32 @@ REGISTER_OP("EditDistance") .Attr("normalize: bool = true") .Attr("T: type") .Output("output: float") + .SetShapeFn([](InferenceContext* c) { + const Tensor* hypothesis_shape_t = c->input_tensor(2); + const Tensor* truth_shape_t = c->input_tensor(5); + if (hypothesis_shape_t == nullptr || truth_shape_t == nullptr) { + // We need to know the runtime shape of the two tensors, + // or else the output shape is unknown. + return shape_inference::UnknownShape(c); + } + + if (hypothesis_shape_t->NumElements() != truth_shape_t->NumElements()) { + return errors::InvalidArgument( + "Num elements of hypothesis_shape does not match truth_shape: ", + hypothesis_shape_t->NumElements(), " vs. ", + truth_shape_t->NumElements()); + } + + auto h_values = hypothesis_shape_t->flat(); + auto t_values = truth_shape_t->flat(); + std::vector dims(hypothesis_shape_t->NumElements() - 1); + for (int i = 0; i < dims.size(); ++i) { + dims[i] = c->MakeDim(std::max(h_values(i), t_values(i))); + } + + c->set_output(0, c->MakeShape(dims)); + return Status::OK(); + }) .Doc(R"doc( Computes the (possibly normalized) Levenshtein Edit Distance. 
@@ -1146,6 +1202,58 @@ REGISTER_OP("Reshape") .Input("shape: int32") .Output("output: T") .Attr("T: type") + .SetShapeFn([](InferenceContext* c) { + const Shape* in = c->input(0); + const Shape* out; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out)); + + // If the rank and all dimensions of the input tensor are known, we may + // infer missing shape information or perform shape checks. + // NumElements conveniently returns kUnknownDim upon missing rank or + // dimension information. + // Additionally, if the rank of the out shape is unknown we have no shape + // information to go off of. + const Dimension* num_in_elems = c->NumElements(in); + const Dimension* num_out_elems = c->NumElements(out); + if (!c->ValueKnown(num_in_elems) || !c->RankKnown(out)) { + // Do nothing. We have no shape information to infer from so we directly + // return out as our shape. + } else if (c->ValueKnown(num_out_elems)) { + // If we know the number of output elements, we ensure that they + // are equal to the number of input elements. + if (c->Value(num_in_elems) != c->Value(num_out_elems)) { + return errors::InvalidArgument( + "Cannot reshape a tensor with ", c->DebugString(num_in_elems), + " elements to shape ", c->DebugString(out), " (", + c->DebugString(num_out_elems), " elements)"); + } + } else { + // If we don't know the number of output elements, we can infer + // the missing dimension. 
+ int32 unknown_idx = -1; + const Dimension* known_elems = c->MakeDim(1); + for (int32 i = 0; i < c->Rank(out); ++i) { + const Dimension* dim = c->Dim(out, i); + if (!c->ValueKnown(dim)) { + if (unknown_idx >= 0) { + return errors::InvalidArgument( + "Cannot infer multiple unknown dimensions in shape ", + c->DebugString(out)); + } + unknown_idx = i; + } else { + TF_RETURN_IF_ERROR(c->Multiply(known_elems, dim, &known_elems)); + } + } + const Dimension* inferred_dim; + TF_RETURN_IF_ERROR( + c->Divide(num_in_elems, c->Value(known_elems), &inferred_dim)); + TF_RETURN_IF_ERROR(c->ReplaceDim(out, unknown_idx, inferred_dim, &out)); + } + + c->set_output(0, out); + return Status::OK(); + }) .Doc(R"Doc( Reshapes a tensor. @@ -1248,6 +1356,58 @@ REGISTER_OP("Transpose") .Input("perm: int32") .Output("y: T") .Attr("T: type") + .SetShapeFn([](InferenceContext* c) { + const Shape* input = c->input(0); + const Shape* perm_shape = c->input(1); + const Tensor* perm = c->input_tensor(1); + const Dimension* perm_elems = c->NumElements(perm_shape); + // If we don't have rank information on the input or value information on + // perm we can't return any shape information, otherwise we have enough + // information to at least find the rank of the output. + if (!c->RankKnown(input) && !c->ValueKnown(perm_elems) && + perm == nullptr) { + c->set_output(0, c->UnknownShape()); + return Status::OK(); + } + + // Find our value of the rank. + int64 rank; + if (c->RankKnown(input)) { + rank = c->Rank(input); + } else if (c->ValueKnown(perm_elems)) { + rank = c->Value(perm_elems); + } else { + rank = perm->NumElements(); + } + std::vector dims; + dims.resize(rank); + TF_RETURN_IF_ERROR(c->WithRank(input, rank, &input)); + // Ensure that perm is a vector and has rank elements. 
+ TF_RETURN_IF_ERROR(c->WithRank(perm_shape, 1, &perm_shape)); + TF_RETURN_IF_ERROR(c->WithValue(perm_elems, rank, &perm_elems)); + + // If we know the rank of the input and the value of perm, we can return + // all shape informantion, otherwise we can only return rank information, + // but no information for the dimensions. + if (perm != nullptr) { + auto flat_perm = perm->flat(); + for (int32 i = 0; i < rank; ++i) { + int32 in_idx = flat_perm(i); + if (in_idx >= rank) { + return errors::InvalidArgument( + "perm dim ", in_idx, " is out of range of input rank ", rank); + } + dims[i] = c->Dim(input, in_idx); + } + } else { + for (int i = 0; i < rank; ++i) { + dims[i] = c->UnknownDim(); + } + } + + c->set_output(0, c->MakeShape(dims)); + return Status::OK(); + }) .Doc(R"doc( Shuffle dimensions of x according to a permutation. @@ -1337,7 +1497,7 @@ Status ShapeShapeFn(InferenceContext* c) { for (int i = 0; i < c->num_inputs(); ++i) { const Dimension* dim; if (c->RankKnown(c->input(i))) { - dim = c->MakeDim(c->Rank(c->input(0))); + dim = c->MakeDim(c->Rank(c->input(i))); } else { dim = c->UnknownDim(); } @@ -1373,6 +1533,7 @@ REGISTER_OP("ShapeN") .Output("output: N * int32") .Attr("N: int") .Attr("T: type") + .SetShapeFn(ShapeShapeFn) .Doc(R"doc( Returns shape of tensors. @@ -1387,6 +1548,42 @@ REGISTER_OP("ReverseSequence") .Attr("seq_dim: int") .Attr("batch_dim: int = 0") .Attr("T: type") + .SetShapeFn([](InferenceContext* c) { + const Shape* input = c->input(0); + const Shape* seq_lens_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &seq_lens_shape)); + + int64 seq_dim; + TF_RETURN_IF_ERROR(c->GetAttr("seq_dim", &seq_dim)); + int64 batch_dim; + TF_RETURN_IF_ERROR(c->GetAttr("batch_dim", &batch_dim)); + + if (!c->RankKnown(input)) { + return shape_inference::UnknownShape(c); + } + + // Validate batch_dim and seq_dim against input. 
+ const int32 input_rank = c->Rank(input); + if (batch_dim >= input_rank) { + return errors::InvalidArgument("batch_dim must be < input rank: ", + batch_dim, " vs. ", input_rank); + } + if (seq_dim >= input_rank) { + return errors::InvalidArgument("seq_dim must be < input rank: ", + seq_dim, " vs. ", input_rank); + } + + const Dimension* batch_dim_dim = c->Dim(input, batch_dim); + TF_RETURN_IF_ERROR( + c->Merge(batch_dim_dim, c->Dim(seq_lens_shape, 0), &batch_dim_dim)); + + // Replace batch_dim of input with batch_size + const Shape* output_shape; + TF_RETURN_IF_ERROR( + c->ReplaceDim(input, batch_dim, batch_dim_dim, &output_shape)); + c->set_output(0, output_shape); + return Status::OK(); + }) .Doc(R"doc( Reverses variable length slices. @@ -1458,6 +1655,7 @@ REGISTER_OP("Rank") .Input("input: T") .Output("output: int32") .Attr("T: type") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Returns the rank of a tensor. @@ -1481,6 +1679,7 @@ REGISTER_OP("Size") .Input("input: T") .Output("output: int32") .Attr("T: type") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Returns the size of a tensor. @@ -1563,7 +1762,7 @@ begin_mask: a bitmask where a bit i being 1 means to ignore the begin begin[i] will be replaced with `[0, n-1) if `stride[i] > 0` or `[-1, n-1]` if `stride[i] < 0` end_mask: analogous to `begin_mask` -ellipsis_mask: a bitmask where bit `i` being 1 means the `i`th +ellipsis_mask: a bitmask where bit `i` being 1 means the `i`th position is actually an ellipsis. One bit at most can be 1. new_axis_mask: a bitmask where bit `i` being 1 means the `i`th position creates a dimension in the tensor of length 1. Thus @@ -1572,7 +1771,7 @@ new_axis_mask: a bitmask where bit `i` being 1 means the `i`th shrink_axis_mask: a bitmask where bit `i` implies that the `i`th position should shrink the dimensionality. begin and end must imply a slice of size 1 in the dimension. 
For example in - python one might do `foo[:,3,:]` which would result in + python one might do `foo[:,3,:]` which would result in `shrink_axis_mask` being 2. )doc"); @@ -1599,7 +1798,7 @@ as `shape`). The gradient will be zero in any element that the slice does not select. Arguments are the same as StridedSliceGrad with the exception that -`dy` is the input gradient to be propagated and `shape` is the +`dy` is the input gradient to be propagated and `shape` is the shape of `StridedSlice`'s `input`. )doc"); @@ -1609,6 +1808,44 @@ REGISTER_OP("Tile") .Input("multiples: int32") .Output("output: T") .Attr("T: type") + .SetShapeFn([](InferenceContext* c) { + const Shape* input; + const Shape* multiples; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &multiples)); + const Dimension* multiples_dim0 = c->Dim(multiples, 0); + if (!c->ValueKnown(multiples_dim0)) { + // Length of multiples vector unknown, so output is unknown. + // + // NOTE: we could potentially merge the input rank with the + // multiples length. + return shape_inference::UnknownShape(c); + } + + int32 rank = c->Value(multiples_dim0); + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), rank, &input)); + const Tensor* multiples_t = c->input_tensor(1); + if (multiples_t == nullptr) { + // If multiples vector isn't available, we only know the + // output rank, not the sizes. + std::vector dims; + for (int64 i = 0; i < rank; ++i) { + dims.push_back(c->UnknownDim()); + } + c->set_output(0, c->MakeShape(dims)); + return Status::OK(); + } + + // Multiply each input dimension by its corresponding value + // from the multiples tensor. + auto multiples_data = multiples_t->vec(); + std::vector dims(rank); + for (int i = 0; i < rank; ++i) { + const int32 multiple = multiples_data(i); + TF_RETURN_IF_ERROR(c->Multiply(c->Dim(input, i), multiple, &dims[i])); + } + c->set_output(0, c->MakeShape(dims)); + return Status::OK(); + }) .Doc(R"doc( Constructs a tensor by tiling a given tensor. 
@@ -1638,7 +1875,14 @@ each repeated tile of `input` into `output`. )doc"); // -------------------------------------------------------------------------- -REGISTER_OP("Where").Input("input: bool").Output("index: int64").Doc(R"doc( +REGISTER_OP("Where") + .Input("input: bool") + .Output("index: int64") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Matrix(c->UnknownDim(), c->Rank(c->input(0)))); + return Status::OK(); + }) + .Doc(R"doc( Returns locations of true values in a boolean tensor. This operation returns the coordinates of true elements in `input`. The @@ -1786,6 +2030,49 @@ REGISTER_OP("MirrorPadGrad") .Output("output: T") .Attr("T: type") .Attr(GetMirrorPadModeAttrString()) + .SetShapeFn([](InferenceContext* c) { + const Shape* paddings; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &paddings)); + const Dimension* pad_0 = c->Dim(paddings, 0); + if (!c->ValueKnown(pad_0)) { + // We don't know the rank of the output since the first + // padding dimension is unknown. + c->set_output(0, c->UnknownShape()); + return Status::OK(); + } + + int64 input_rank = c->Value(pad_0); + const Shape* input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), input_rank, &input)); + TF_RETURN_IF_ERROR( + c->Merge(paddings, c->Matrix(input_rank, 2), &paddings)); + + const Tensor* paddings_t = c->input_tensor(1); + if (paddings_t == nullptr) { + // Values of 'paddings' is not available, but we know the + // input rank, so return the rank of the output with unknown + // dimensions. 
+ std::vector dims; + for (int64 i = 0; i < input_rank; ++i) dims.push_back(c->UnknownDim()); + c->set_output(0, c->MakeShape(dims)); + return Status::OK(); + } + + auto paddings_data = paddings_t->matrix(); + std::vector dims(input_rank); + for (int i = 0; i < input_rank; ++i) { + const int64 pad0 = static_cast(paddings_data(i, 0)); + const int64 pad1 = static_cast(paddings_data(i, 1)); + if (pad0 < 0 || pad1 < 0) { + return errors::InvalidArgument("Paddings must be non-negative"); + } + + TF_RETURN_IF_ERROR( + c->Subtract(c->Dim(input, i), pad0 + pad1, &dims[i])); + } + c->set_output(0, c->MakeShape(dims)); + return Status::OK(); + }) .Doc(R"doc( Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor. @@ -1820,6 +2107,25 @@ REGISTER_OP("Placeholder") .Output("output: dtype") .Attr("dtype: type") .Attr("shape: shape = {}") + .SetShapeFn([](InferenceContext* c) { + PartialTensorShape shape; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape)); + + // Placeholder has a legacy bug where we cannot tell + // the difference between a scalar shape attribute and + // 'unknown shape'. So if the shape is a scalar, we return + // an unknown shape. + if (shape.dims() == 0) { + return shape_inference::UnknownShape(c); + } + + TensorShapeProto shape_proto; + shape.AsProto(&shape_proto); + const Shape* out; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(shape_proto, &out)); + c->set_output(0, out); + return Status::OK(); + }) .Doc(R"doc( A placeholder op for a value that will be fed into the computation. 
@@ -1839,6 +2145,22 @@ REGISTER_OP("PlaceholderWithDefault") .Output("output: dtype") .Attr("dtype: type") .Attr("shape: shape") + .SetShapeFn([](InferenceContext* c) { + const Shape* input = c->input(0); + PartialTensorShape shape; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape)); + TensorShapeProto shape_proto; + shape.AsProto(&shape_proto); + const Shape* out; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(shape_proto, &out)); + + // We merge for compatibility checking, but return the output, + // since output_shape may be less precise than input_shape. + const Shape* unused; + TF_RETURN_IF_ERROR(c->Merge(input, out, &unused)); + c->set_output(0, out); + return Status::OK(); + }) .Doc(R"doc( A placeholder op that passes though `input` when its output is not fed. @@ -1926,6 +2248,67 @@ REGISTER_OP("Squeeze") .Output("output: T") .Attr("T: type") .Attr("squeeze_dims: list(int) >= 0 = []") + .SetShapeFn([](InferenceContext* c) { + const Shape* input = c->input(0); + if (!c->RankKnown(input)) { + // Input shape unknown. + return shape_inference::UnknownShape(c); + } + + const int32 input_rank = c->Rank(input); + + // Validate and wrap squeeze dimensions. + std::vector squeeze_dims; + TF_RETURN_IF_ERROR(c->GetAttr("squeeze_dims", &squeeze_dims)); + for (int i = 0; i < squeeze_dims.size(); ++i) { + if (squeeze_dims[i] < -input_rank || squeeze_dims[i] >= input_rank) { + return errors::InvalidArgument("squeeze_dims[", i, "] not in [", + -input_rank, ",", input_rank, ")."); + } + + if (squeeze_dims[i] < 0) { + squeeze_dims[i] += input_rank; + } + } + + std::vector result_shape; + for (int i = 0; i < input_rank; ++i) { + // True if squeeze_dims contains an entry to squeeze this + // dimension. + bool is_explicit_match = + std::find(squeeze_dims.begin(), squeeze_dims.end(), i) != + squeeze_dims.end(); + + const Dimension* dim = c->Dim(input, i); + + if (!c->ValueKnown(dim)) { + // Assume that the squeezed dimension will be 1 at runtime. 
+ if (is_explicit_match) continue; + + // If squeezing all 1 dimensions, and we see an unknown value, + // give up and return Unknown Shape. + if (squeeze_dims.empty()) { + c->set_output(0, c->UnknownShape()); + return Status::OK(); + } + } else if (c->Value(dim) == 1) { + if (is_explicit_match || squeeze_dims.empty()) { + // If explicitly squeezing, or squeezing all 1s, remove + // this dimension. + continue; + } + } else if (is_explicit_match) { + return errors::InvalidArgument("Can not squeeze dim[", i, + "], expected a dimension of 1, got ", + c->Value(c->Dim(input, i))); + } + + result_shape.emplace_back(dim); + } + + c->set_output(0, c->MakeShape(result_shape)); + return Status::OK(); + }) .Doc(R"doc( Removes dimensions of size 1 from the shape of a tensor. @@ -2389,6 +2772,76 @@ REGISTER_OP("ExtractImagePatches") .Attr("rates: list(int) >= 4") .Attr("T: realnumbertype") .Attr(GetPaddingAttrString()) + .SetShapeFn([](InferenceContext* c) { + const Shape* input_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape)); + + std::vector ksizes; + TF_RETURN_IF_ERROR(c->GetAttr("ksizes", &ksizes)); + if (ksizes.size() != 4) { + return errors::InvalidArgument( + "ExtractImagePatches requires the ksizes attribute to contain 4 " + "values, but got: ", + ksizes.size()); + } + + std::vector strides; + TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); + if (strides.size() != 4) { + return errors::InvalidArgument( + "ExtractImagePatches requires the stride attribute to contain 4 " + "values, but got: ", + strides.size()); + } + + std::vector rates; + TF_RETURN_IF_ERROR(c->GetAttr("rates", &rates)); + if (rates.size() != 4) { + return errors::InvalidArgument( + "ExtractImagePatches requires the rates attribute to contain 4 " + "values, but got: ", + rates.size()); + } + + int32 ksize_rows = ksizes[1]; + int32 ksize_cols = ksizes[2]; + + int32 stride_rows = strides[1]; + int32 stride_cols = strides[2]; + + int32 rate_rows = rates[1]; + int32 rate_cols = 
rates[2]; + + int32 ksize_rows_eff = ksize_rows + (ksize_rows - 1) * (rate_rows - 1); + int32 ksize_cols_eff = ksize_cols + (ksize_cols - 1) * (rate_cols - 1); + + const Dimension* batch_size_dim = c->Dim(input_shape, 0); + const Dimension* in_rows_dim = c->Dim(input_shape, 1); + const Dimension* in_cols_dim = c->Dim(input_shape, 2); + const Dimension* output_depth_dim = c->Dim(input_shape, 3); + + // At the moment we need to know the values of several fields. + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols")); + auto in_rows = c->Value(in_rows_dim); + auto in_cols = c->Value(in_cols_dim); + + Padding padding; + TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding)); + + int64 output_rows, output_cols; + int64 padding_before, padding_after; + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_rows, ksize_rows_eff, stride_rows, padding, &output_rows, + &padding_before, &padding_after)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_cols, ksize_cols_eff, stride_cols, padding, &output_cols, + &padding_before, &padding_after)); + const Shape* output_shape = c->MakeShape( + {batch_size_dim, output_rows, output_cols, output_depth_dim}); + c->set_output(0, output_shape); + return Status::OK(); + }) .Doc(R"doc( Extract `patches` from `images` and put them in the "depth" output dimension. @@ -2420,6 +2873,55 @@ REGISTER_OP("Bitcast") .Output("output: type") .Attr("T: numbertype") .Attr("type: numbertype") + .SetShapeFn([](InferenceContext* c) { + const Shape* input = c->input(0); + if (!c->RankKnown(input)) { + // Input shape unknown. + return shape_inference::UnknownShape(c); + } + + // Find the size of the input and output data types. 
+ DataType input_type; + DataType output_type; + TF_RETURN_IF_ERROR(c->GetAttr("T", &input_type)); + TF_RETURN_IF_ERROR(c->GetAttr("type", &output_type)); + const int input_type_size = DataTypeSize(input_type); + const int output_type_size = DataTypeSize(output_type); + + if (input_type_size == 0 || output_type_size == 0) { + return errors::InvalidArgument("Cannot bitcast types ", + DataTypeString(input_type), " to ", + DataTypeString(output_type), + " because " + "one of the type sizes is zero."); + } + + const Shape* new_shape; + if (input_type_size == output_type_size) { + // No change in size. + new_shape = input; + } else if (input_type_size < output_type_size) { + TF_RETURN_IF_ERROR(c->WithRankAtLeast(input, 1, &new_shape)); + + int64 divisor_val = output_type_size / input_type_size; + const Dimension* last_dim = c->Dim(new_shape, -1); + if (!c->ValueKnown(last_dim) || c->Value(last_dim) == divisor_val) { + TF_RETURN_IF_ERROR(c->Subshape(new_shape, 0, -1, &new_shape)); + } else { + return errors::InvalidArgument("Cannot bitcast due to shape. ", + c->Value(last_dim), " does not match ", + divisor_val); + } + } else { + // Input type size is larger than output type size. + int64 divisor_val = input_type_size / output_type_size; + const Shape* extension = c->Vector(divisor_val); + TF_RETURN_IF_ERROR(c->Concatenate(input, extension, &new_shape)); + } + + c->set_output(0, new_shape); + return Status::OK(); + }) .Doc(R"doc( Bitcasts a tensor from one type to another without copying data. 
@@ -2446,6 +2948,32 @@ REGISTER_OP("OneHot") .Output("output: T") .Attr("T: type") .Attr("TI: {uint8, int32, int64} = DT_INT64") + .SetShapeFn([](InferenceContext* c) { + int32 axis; + TF_RETURN_IF_ERROR(c->GetAttr("axis", &axis)); + if (axis < -1) return errors::InvalidArgument("axis must be >= -1"); + + const Dimension* depth; + TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(1, &depth)); + + const Shape* indices = c->input(0); + if (!c->RankKnown(indices)) return shape_inference::UnknownShape(c); + + int32 new_rank = c->Rank(indices) + 1; + // We need to add new_rank to axis in the case the axis is -1 because + // C++ returns negative values from % if the dividend is negative. + int32 depth_index = (axis + new_rank) % new_rank; + // Out shape is indices[0:depth_index] + [depth] + indices[depth_index:]. + const Shape* front; + const Shape* back; + const Shape* out; + TF_RETURN_IF_ERROR(c->Subshape(indices, 0, depth_index, &front)); + TF_RETURN_IF_ERROR(c->Subshape(indices, depth_index, &back)); + TF_RETURN_IF_ERROR(c->Concatenate(front, c->Vector(depth), &front)); + TF_RETURN_IF_ERROR(c->Concatenate(front, back, &out)); + c->set_output(0, out); + return Status::OK(); + }) .Doc(R"doc( Returns a one-hot tensor. @@ -2660,6 +3188,7 @@ REGISTER_OP("DebugIdentity") .Output("output: T") .Attr("T: type") .Attr("tensor_name: string = ''") + .Attr("debug_urls: list(string) = []") .Doc(R"doc( Debug Identity Op. @@ -2668,6 +3197,8 @@ Provides an identity mapping of the non-Ref type input tensor for debugging. input: Input tensor, non-Reference type. output: Output tensor that equals the input tensor. tensor_name: Name of the input tensor. 
+debug_urls: List of URLs to debug targets, e.g., + file:///foo/tfdbg_dump, grpc:://localhost:11011 )doc"); REGISTER_OP("DebugNanCount") @@ -2675,6 +3206,7 @@ REGISTER_OP("DebugNanCount") .Output("output: int64") // The debug signal (nan count) is int64 .Attr("T: type") .Attr("tensor_name: string = ''") + .Attr("debug_urls: list(string) = []") .Doc(R"doc( Debug NaN Value Counter Op @@ -2683,6 +3215,8 @@ Counts number of NaNs in the input tensor, for debugging. input: Input tensor, non-Reference type. output: An integer output tensor that is the number of NaNs in the input. tensor_name: Name of the input tensor. +debug_urls: List of URLs to debug targets, e.g., + file:///foo/tfdbg_dump, grpc:://localhost:11011 )doc"); } // namespace tensorflow diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc index 8c103ec8229..6345db128e7 100644 --- a/tensorflow/core/ops/array_ops_test.cc +++ b/tensorflow/core/ops/array_ops_test.cc @@ -237,6 +237,20 @@ TEST(ArrayOpsTest, Shape_ShapeFn) { INFER_OK(op, "[?,2,3,4,5]", "[5]"); } +TEST(ArrayOpsTest, ShapeN_ShapeFn) { + ShapeInferenceTestOp op("ShapeN"); + int n = 3; + std::vector src_list; + for (int i = 0; i < n; ++i) src_list.emplace_back("a", 0, DT_FLOAT); + TF_CHECK_OK(NodeDefBuilder("test", "ShapeN") + .Input(src_list) + .Attr("N", n) + .Finalize(&op.node_def)); + INFER_OK(op, "?;?;?", "[?];[?];[?]"); + INFER_OK(op, "[?];[?];[?]", "[1];[1];[1]"); + INFER_OK(op, "[?,2,3,4,5];?;[1,?,3]", "[5];[?];[3]"); +} + TEST(ArrayOpsTest, Unique_ShapeFn) { ShapeInferenceTestOp op("Unique"); INFER_OK(op, "?", "[?];in0"); @@ -287,6 +301,38 @@ TEST(ArrayOpsTest, PadD_ShapeFn) { } } +TEST(ArrayOpsTest, MirrorPadGrad_ShapeFn) { + ShapeInferenceTestOp op("MirrorPadGrad"); + op.input_tensors.resize(2); + + // Inputs are input and paddings. + INFER_OK(op, "?;?", "?"); + + // First padding dimension is unknown, so rank is unknown. 
+ INFER_OK(op, "?;[?,4]", "?"); + + // Input tensor rank doesn't match paddings dimension. + INFER_ERROR("must be rank 3 but is rank 2", op, "[?,?];[3,2]"); + + // Paddings tensor is not a [rank x 2] matrix. + INFER_ERROR("Dimension 1 in both shapes must be equal, but are 3 and 2", op, + "[?,?,?];[3,3]"); + + // Paddings tensor is unknown, but rank is known, so the output + // shape is a rank 3 unknown shape. + INFER_OK(op, "[?,?,?];[3,2]", "[?,?,?]"); + + // Make the paddings tensor known and verify padding values get + // subtracted. E.g., if padding is ((1,10),(2,20),(3,30)) then + // values 11,22,23 are subtracted to input dims to get output. + Tensor paddings_t(DT_INT32, TensorShape{3, 2}); + test::FillValues(&paddings_t, {1, 10, 2, 20, 3, 30}); + op.input_tensors[1] = &paddings_t; + + INFER_OK(op, "[111,222,333];[3,2]", "[100,200,300]"); + INFER_OK(op, "[111,?,333];[3,2]", "[100,?,300]"); +} + TEST(ArrayOpsTest, BroadcastGradientArgs_ShapeFn) { ShapeInferenceTestOp op("BroadcastGradientArgs"); // Output is always two unknown vectors. @@ -476,4 +522,391 @@ TEST(ArrayOpsTest, ConcatOffset_ShapeFn) { INFER_OK(op, "?;?;?;?;?", "in1;in2;in3;in4"); } +TEST(ArrayOpsTest, Reshape_ShapeFn) { + ShapeInferenceTestOp op("Reshape"); + op.input_tensors.resize(2); + + // No valid shape provided. + INFER_OK(op, "?;?", "?"); + INFER_OK(op, "[?];?", "?"); + INFER_OK(op, "[?];[?]", "?"); + INFER_OK(op, "[4];[?]", "?"); + + // All dimensions provided. + Tensor new_shape = test::AsTensor({1, 2, 3}); + op.input_tensors[1] = &new_shape; + INFER_OK(op, "[?];[3]", "[1,2,3]"); + INFER_OK(op, "[6];[3]", "[1,2,3]"); + // The number of elements should match for the reshape to succeed. + INFER_ERROR( + "Cannot reshape a tensor with 12 elements to shape [1,2,3] (6 elements)", + op, "[3,4];[3]"); + + // Unknown dimensions. 
+ // Flatten: + new_shape = test::AsTensor({-1}); + INFER_OK(op, "[?];[1]", "[?]"); + INFER_OK(op, "[2,2];[1]", "[4]"); + // The first dimension is inferred: + new_shape = test::AsTensor({2, -1}); + INFER_OK(op, "[3,4];[2]", "[2,6]"); + // The total number of elements must be divisible by the known dimensions. + INFER_ERROR("Dimension size must be divisible by 2 but is 7", op, "[7];[2]"); + // Multiple missing dimensions cannot be inferred. + new_shape = test::AsTensor({-1, -1, 2}); + INFER_ERROR("Cannot infer multiple unknown dimensions in shape [?,?,2]", op, + "[8];[3]"); + + // Reshaping to a scalar. + new_shape = test::AsTensor({}); + INFER_OK(op, "[1];[0]", "[]"); + INFER_ERROR( + "Cannot reshape a tensor with 2 elements to shape [] (1 elements)", op, + "[1,2];[0]"); +} + +TEST(ArrayOpsTest, Placeholder_ShapeFn) { + { + // 2D shape + ShapeInferenceTestOp op("Placeholder"); + TensorShape shape({1, 2}); + TF_CHECK_OK(NodeDefBuilder("test", "Placeholder") + .Attr("shape", shape) + .Attr("dtype", DT_FLOAT) + .Finalize(&op.node_def)); + INFER_OK(op, "", "[1,2]"); + } + + { + // Scalar shapes are unknown shapes due to legacy. 
+ ShapeInferenceTestOp op("Placeholder"); + TensorShape shape({}); + TF_CHECK_OK(NodeDefBuilder("test", "Placeholder") + .Attr("shape", shape) + .Attr("dtype", DT_FLOAT) + .Finalize(&op.node_def)); + INFER_OK(op, "", "?"); + } + + { + // Partial shape + ShapeInferenceTestOp op("Placeholder"); + const int64 dims[2] = {1, -1}; + PartialTensorShape shape; + TF_CHECK_OK(PartialTensorShape::MakePartialShape(dims, 2, &shape)); + TF_CHECK_OK(NodeDefBuilder("test", "Placeholder") + .Attr("shape", shape) + .Attr("dtype", DT_FLOAT) + .Finalize(&op.node_def)); + INFER_OK(op, "", "[1,?]"); + } + + { + ShapeInferenceTestOp op("PlaceholderWithDefault"); + const int64 dims[2] = {1, -1}; + PartialTensorShape shape; + TF_CHECK_OK(PartialTensorShape::MakePartialShape(dims, 2, &shape)); + TF_CHECK_OK(NodeDefBuilder("test", "PlaceholderWithDefault") + .Input("input", 0, DT_FLOAT) + .Attr("shape", shape) + .Attr("dtype", DT_FLOAT) + .Finalize(&op.node_def)); + INFER_OK(op, "[1,2]", "[1,?]"); + + // input shape is not compatible with output shape. + INFER_ERROR("Dimension 0 in both shapes must be equal, but are 2 and 1", op, + "[2,3]"); + // Wrong rank + INFER_ERROR("Shapes must be equal rank, but are 3 and 2", op, "[1,3,10]"); + } +} + +TEST(ArrayOpsTest, Transpose_ShapeFn) { + ShapeInferenceTestOp op("Transpose"); + op.input_tensors.resize(2); + + // Missing shape information. + INFER_OK(op, "?;?", "?"); + INFER_OK(op, "?;[?]", "?"); + INFER_OK(op, "?;[2]", "[?,?]"); + INFER_OK(op, "[?];?", "[?]"); + INFER_OK(op, "[?,?];[2]", "[?,?]"); + INFER_ERROR("Dimension must be 3 but is 2", op, "[1,2,3];[2]"); + Tensor perm = test::AsTensor({0}); + op.input_tensors[1] = &perm; + INFER_OK(op, "[?];[?]", "[d0_0]"); + perm = test::AsTensor({1, 0}); + INFER_OK(op, "?;[2]", "[?,?]"); + INFER_OK(op, "[?,?];[2]", "[d0_1,d0_0]"); + INFER_OK(op, "[1,?];[2]", "[d0_1,d0_0]"); + + // Invalid arguments. 
+ perm = test::AsTensor({1, 2}); + INFER_ERROR("perm dim 2 is out of range of input rank 2", op, "[1,2];[2]"); + perm = test::AsTensor({0}); + INFER_ERROR("Dimension must be 2 but is 1", op, "[1,2];[1]"); + + // Larger valid cases. + perm = test::AsTensor({1, 0, 3, 4, 2}); + INFER_OK(op, "[0,1,2,3,4];[5]", "[d0_1,d0_0,d0_3,d0_4,d0_2]"); + INFER_OK(op, "[0,?,2,3,4];[5]", "[d0_1,d0_0,d0_3,d0_4,d0_2]"); +} + +TEST(ArrayOpsTest, Bitcast_ShapeFn) { + ShapeInferenceTestOp op("Bitcast"); + auto rebuild_node_def = [&op](DataType input_type, DataType output_type) { + TF_CHECK_OK(NodeDefBuilder("test", "Bitcast") + .Input("input", 0, input_type) + .Attr("type", output_type) + .Finalize(&op.node_def)); + }; + + rebuild_node_def(DT_FLOAT, DT_INT32); + // No valid shape provided, so output is unknown. + INFER_OK(op, "?", "?"); + + // Bitcasting from two equal sizes propagates shape. + INFER_OK(op, "[1,2]", "in0"); + + // Bitcasting from smaller to larger reduces the size of the last dimension. + rebuild_node_def(DT_INT32, DT_INT64); + INFER_OK(op, "[1,2]", "[d0_0]"); // last dimension matches divisor. + // TODO(vrv): Seems like a bug, or at least, too lenient. + INFER_OK(op, "[1,?]", "[d0_0]"); + // 4 is divisible by 2, but the shape function signature requires + // that the last dimension matches the last value exactly. + INFER_ERROR("does not match", op, "[1,4]"); + INFER_ERROR("does not match", op, "[1,3]"); + + // Bitcasting from a larger type to a smaller type extends the dimension + rebuild_node_def(DT_INT64, DT_INT32); + INFER_OK(op, "[4,5]", "[d0_0,d0_1,2]"); + rebuild_node_def(DT_COMPLEX128, DT_INT32); + INFER_OK(op, "[4,5]", "[d0_0,d0_1,4]"); + rebuild_node_def(DT_COMPLEX128, DT_HALF); + INFER_OK(op, "[4,5]", "[d0_0,d0_1,8]"); + rebuild_node_def(DT_COMPLEX128, DT_INT8); + INFER_OK(op, "[4,5]", "[d0_0,d0_1,16]"); + + // Bitcasting from a POD or quantized datatype is not allowed. 
+ rebuild_node_def(DT_STRING, DT_INT32); + INFER_ERROR("one of the type sizes is zero", op, "[1,2,3]"); + rebuild_node_def(DT_INT32, DT_STRING); + INFER_ERROR("one of the type sizes is zero", op, "[1,2,3]"); +} + +TEST(ArrayOpsTest, Squeeze_ShapeFn) { + ShapeInferenceTestOp op("Squeeze"); + + auto rebuild_node_def = [&op](const std::vector& squeeze_dims) { + TF_CHECK_OK(NodeDefBuilder("test", "Squeeze") + .Input("input", 0, DT_FLOAT) + .Attr("squeeze_dims", squeeze_dims) + .Finalize(&op.node_def)); + }; + + // Default squeeze_dims = [] + rebuild_node_def({}); + + // No valid shape provided, so output is unknown. + INFER_OK(op, "?", "?"); + + INFER_OK(op, "[1,4,1,5,1]", "[d0_1,d0_3]"); + + // Squeezing all dimensions, but see some unknown values. + INFER_OK(op, "[1,?,1,?,1]", "?"); + + // Test simple squeeze of an explicit dimension + rebuild_node_def({1}); + INFER_OK(op, "[4,1,5]", "[d0_0,d0_2]"); + // Squeezing unknown dim explicitly, assumes it's 1 at runtime. + INFER_OK(op, "[4,?,5]", "[d0_0,d0_2]"); + + // Attempt to squeeze non-one dimension + INFER_ERROR("Can not squeeze dim[1]", op, "[4,6,5]"); + + // Squeeze multiple dimensions + rebuild_node_def({1, 2}); + INFER_OK(op, "[4,1,1,5]", "[d0_0,d0_3]"); + rebuild_node_def({1, -2}); + INFER_OK(op, "[4,1,1,5]", "[d0_0,d0_3]"); + + // Negative squeeze dim + rebuild_node_def({-2}); + INFER_OK(op, "[4,1,5]", "[d0_0,d0_2]"); + + // Test validation of squeeze dimensions + rebuild_node_def({-4}); + INFER_ERROR("not in [-3,3)", op, "[1,2,3]"); + rebuild_node_def({3}); + INFER_ERROR("not in [-3,3)", op, "[1,2,3]"); +} + +TEST(ArrayOpsTest, ReverseSequence_ShapeFn) { + ShapeInferenceTestOp op("ReverseSequence"); + auto rebuild_node_def = [&op](const int32 seq_dim, const int32 batch_dim) { + TF_CHECK_OK(NodeDefBuilder("test", "ReverseSequence") + .Input("input", 0, DT_FLOAT) + .Input("seq_lengths", 1, DT_INT64) + .Attr("seq_dim", seq_dim) + .Attr("batch_dim", batch_dim) + .Finalize(&op.node_def)); + }; + + 
rebuild_node_def(1, 2); + // No valid shape provided, so output is unknown. + INFER_OK(op, "?;[10]", "?"); + + // Bad rank for seq_lengths + INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;[10,10]"); + + // Validate seq_dim and batch_dim + rebuild_node_def(1, 4); + INFER_ERROR("batch_dim must be < input rank", op, "[1,2,3];[3]"); + rebuild_node_def(4, 1); + INFER_ERROR("seq_dim must be < input rank", op, "[1,2,3];[3]"); + + rebuild_node_def(1, 2); + INFER_OK(op, "[1,2,3];[3]", "[d0_0,d0_1,d0_2]"); + // Resolves uncertainty on batch dimension by merging. + INFER_OK(op, "[1,2,?];[3]", "[d0_0,d0_1,d1_0]"); + INFER_OK(op, "[1,2,3];[?]", "[d0_0,d0_1,d0_2]"); +} + +TEST(ArrayOpsTest, Split_ShapeFn) { + ShapeInferenceTestOp op("Split"); + op.input_tensors.resize(2); + + // No value for split_dim and no input. + TF_CHECK_OK(NodeDefBuilder("test", "Split") + .Input("split_dim", 0, DT_INT32) + .Input("value", 1, DT_FLOAT) + .Attr("num_split", 2) + .Finalize(&op.node_def)); + INFER_OK(op, "?;?", "?;?"); + // If the rank is known, we know the rank of each output. + INFER_OK(op, "?;[?,?]", "[?,?];[?,?]"); + + // split_dim is known. + Tensor split_dim = test::AsTensor({1, 2}); + op.input_tensors[0] = &split_dim; + INFER_ERROR("Input must be scalar but has rank 1", op, "[?];[?,?]"); + split_dim = test::AsScalar(1); + INFER_OK(op, "?;?", "?;?"); + INFER_OK(op, "?;[?,?]", "[d1_0,?];[d1_0,?]"); + INFER_OK(op, "?;[1,4]", "[d1_0,2];[d1_0,2]"); + INFER_OK(op, "?;[1,?]", "[d1_0,?];[d1_0,?]"); + INFER_ERROR("Dimension size must be divisible by 2 but is 5", op, "?;[1,5]"); +} + +TEST(ArrayOpsTest, Tile_ShapeFn) { + ShapeInferenceTestOp op("Tile"); + op.input_tensors.resize(2); + + // No value for split_dim and no input. + TF_CHECK_OK(NodeDefBuilder("test", "Tile") + .Input("input", 0, DT_FLOAT) + .Input("multiples", 1, DT_INT32) + .Finalize(&op.node_def)); + + // If multiples rank is unknown, output is unknown. 
+ INFER_OK(op, "[2,3,1,4];?", "?"); + + // Bad rank for 'multiples' + INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[2,3,1,4];[4,1]"); + + // No multiples tensor available, but output rank is known. + INFER_OK(op, "[2,3,1,4];[4]", "[?,?,?,?]"); + + // Test a tile of a 4D input. + Tensor multiples = test::AsTensor({2, 3, 4, 5}); + op.input_tensors[1] = &multiples; + INFER_OK(op, "[2,3,1,4];[4]", "[4,9,4,20]"); +} + +TEST(ArrayOpsTest, EditDistance_ShapeFn) { + ShapeInferenceTestOp op("EditDistance"); + op.input_tensors.resize(6); + + // If the shape tensors are not available, the output shape is unknown. + INFER_OK(op, "[?];[?];[4];[?];[?];[4]", "?"); + + Tensor hypothesis_shape = test::AsTensor({2, 30, 4, 50}); + op.input_tensors[2] = &hypothesis_shape; + Tensor truth_shape = test::AsTensor({20, 3, 40, 5}); + op.input_tensors[5] = &truth_shape; + INFER_OK(op, "[?];[?];[4];[?];[?];[4]", "[20,30,40]"); + + // Shape elements don't match + hypothesis_shape = test::AsTensor({2}); + op.input_tensors[2] = &hypothesis_shape; + INFER_ERROR("Num elements of hypothesis_shape does not match truth_shape", op, + "[?];[?];[1];[?];[?];[4]"); +} + +TEST(ArrayOpsTest, OneHot_ShapeFn) { + ShapeInferenceTestOp op("OneHot"); + op.input_tensors.resize(4); + auto set_axis = [&op](int axis) { + TF_CHECK_OK(NodeDefBuilder("test", "OneHot") + .Input("indices", 0, DT_FLOAT) + .Input("depth", 1, DT_INT32) + .Input("on_value", 2, DT_FLOAT) + .Input("off_value", 3, DT_FLOAT) + .Attr("axis", axis) + .Finalize(&op.node_def)); + }; + + // Invalid axis value. + set_axis(-2); + INFER_ERROR("axis must be >= -1", op, "?;?;?;?"); + set_axis(1); + + // If indices shape is unknown, we return an unknown shape. + INFER_OK(op, "?;[];?;?", "?"); + + // Depth must be scalar. + Tensor depth = test::AsTensor({1, 2}); + op.input_tensors[1] = &depth; + INFER_ERROR("Input must be scalar but has rank 1", op, "?;[2];?;?"); + + // Full information is available. 
+ depth = test::AsScalar(2); + INFER_OK(op, "[1,3,4];[];?;?", "[d0_0,2,d0_1,d0_2]"); + set_axis(-1); + INFER_OK(op, "[1,3,4];[];?;?", "[d0_0,d0_1,d0_2,2]"); +} + +TEST(NNOpsTest, ExtractImagePatchesShapeTest) { + ShapeInferenceTestOp op("ExtractImagePatches"); + auto set_op = [&op](const std::vector& ksizes, + const std::vector& strides, + const std::vector& rates, const string& padding) { + TF_CHECK_OK(NodeDefBuilder("test", "ExtractImagePatches") + .Input("input", 0, DT_FLOAT) + .Attr("ksizes", ksizes) + .Attr("strides", strides) + .Attr("rates", rates) + .Attr("padding", padding) + .Finalize(&op.node_def)); + }; + + // Just tests that the ksize calculation with rates works. Most of + // the other code is boilerplate that is tested by a variety of + // other ops. + // + // ksizes is 2x2. rate rows and cols is 2, so ksize_rows and + // cols are changed to be 2 + (2 - 1) = 3. 7x7 input with 3x3 + // filter and 1x1 stride gives a 5x5 output. + set_op({1, 2, 2, 1}, {1, 1, 1, 1}, {1, 2, 2, 1}, "VALID"); + INFER_OK(op, "[1,7,7,2]", "[d0_0,5,5,d0_3]"); + + // Bad ksize rank + set_op({1, 2, 2, 1, 1}, {1, 1, 1, 1}, {1, 2, 2, 1}, "VALID"); + INFER_ERROR( + "ExtractImagePatches requires the ksizes attribute to contain 4 values, " + "but got: 5", + op, "[1,7,7,2]"); +} + } // end namespace tensorflow diff --git a/tensorflow/core/ops/compat/ops_history.v0.pbtxt b/tensorflow/core/ops/compat/ops_history.v0.pbtxt index 8b3230c3fed..5f40949b95c 100644 --- a/tensorflow/core/ops/compat/ops_history.v0.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v0.pbtxt @@ -5246,6 +5246,148 @@ op { } } } +op { + name: "BatchSelfAdjointEig" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + } + } + } + deprecation { + version: 11 + } +} +op { + name: "BatchSelfAdjointEigV2" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + 
name: "e" + type_attr: "T" + } + output_arg { + name: "v" + type_attr: "T" + } + attr { + name: "compute_v" + type: "bool" + default_value { + b: true + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + } + } + } +} +op { + name: "BatchSvd" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "s" + type_attr: "T" + } + output_arg { + name: "u" + type_attr: "T" + } + output_arg { + name: "v" + type_attr: "T" + } + attr { + name: "compute_uv" + type: "bool" + default_value { + b: false + } + } + attr { + name: "full_matrices" + type: "bool" + default_value { + b: false + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + } + } + } +} +op { + name: "BatchSvd" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "s" + type_attr: "T" + } + output_arg { + name: "u" + type_attr: "T" + } + output_arg { + name: "v" + type_attr: "T" + } + attr { + name: "compute_uv" + type: "bool" + default_value { + b: true + } + } + attr { + name: "full_matrices" + type: "bool" + default_value { + b: false + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + } + } + } +} op { name: "BatchToSpace" input_arg { @@ -7876,6 +8018,36 @@ op { } } } +op { + name: "DebugIdentity" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + } + attr { + name: "tensor_name" + type: "string" + default_value { + s: "" + } + } + attr { + name: "debug_urls" + type: "list(string)" + default_value { + list { + } + } + } +} op { name: "DebugNanCount" input_arg { @@ -7898,6 +8070,36 @@ op { } } } +op { + name: "DebugNanCount" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "output" + type: DT_INT64 + } + attr { + name: "T" + type: "type" + } + attr { + name: "tensor_name" + type: "string" + 
default_value { + s: "" + } + } + attr { + name: "debug_urls" + type: "list(string)" + default_value { + list { + } + } + } +} op { name: "DecodeCSV" input_arg { @@ -20568,6 +20770,62 @@ op { } } } +op { + name: "SelfAdjointEig" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + } + } + } + deprecation { + version: 11 + } +} +op { + name: "SelfAdjointEigV2" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "e" + type_attr: "T" + } + output_arg { + name: "v" + type_attr: "T" + } + attr { + name: "compute_v" + type: "bool" + default_value { + b: true + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + } + } + } +} op { name: "SerializeManySparse" input_arg { @@ -25123,6 +25381,92 @@ op { } } } +op { + name: "Svd" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "s" + type_attr: "T" + } + output_arg { + name: "u" + type_attr: "T" + } + output_arg { + name: "v" + type_attr: "T" + } + attr { + name: "compute_uv" + type: "bool" + default_value { + b: false + } + } + attr { + name: "full_matrices" + type: "bool" + default_value { + b: false + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + } + } + } +} +op { + name: "Svd" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "s" + type_attr: "T" + } + output_arg { + name: "u" + type_attr: "T" + } + output_arg { + name: "v" + type_attr: "T" + } + attr { + name: "compute_uv" + type: "bool" + default_value { + b: true + } + } + attr { + name: "full_matrices" + type: "bool" + default_value { + b: false + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + } + } + } +} op { name: "Switch" input_arg { diff --git 
a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc index 0c1b124f22f..c423c742209 100644 --- a/tensorflow/core/ops/control_flow_ops.cc +++ b/tensorflow/core/ops/control_flow_ops.cc @@ -15,18 +15,32 @@ limitations under the License. #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" namespace tensorflow { using shape_inference::InferenceContext; +using shape_inference::Shape; // -------------------------------------------------------------------------- +namespace { +Status SwitchShape(InferenceContext* c) { + const Shape* unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + const Shape* out = c->input(0); + c->set_output(0, out); + c->set_output(1, out); + return Status::OK(); +} +} // namespace + REGISTER_OP("Switch") .Input("data: T") .Input("pred: bool") .Output("output_false: T") .Output("output_true: T") .Attr("T: type") + .SetShapeFn(SwitchShape) .Doc(R"doc( Forwards `data` to the output port determined by `pred`. @@ -41,7 +55,6 @@ output_false: If `pred` is false, data will be forwarded to this output. output_true: If `pred` is true, data will be forwarded to this output. )doc"); -// -------------------------------------------------------------------------- REGISTER_OP("RefSwitch") .Input("data: Ref(T)") .Input("pred: bool") @@ -49,6 +62,7 @@ REGISTER_OP("RefSwitch") .Output("output_true: Ref(T)") .Attr("T: type") .SetAllowsUninitializedInput() + .SetShapeFn(SwitchShape) .Doc(R"doc( Forwards the ref tensor `data` to the output port determined by `pred`. 
@@ -70,6 +84,26 @@ REGISTER_OP("RefSelect") .Output("output: Ref(T)") .Attr("T: type") .Attr("N: int >= 1") + .SetShapeFn([](InferenceContext* c) { + const Shape* unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused)); + const Shape* first_input = c->input(1); + if (!c->FullyDefined(first_input)) { + c->set_output(0, c->UnknownShape()); + return Status::OK(); + } + // If any inputs aren't fully defined or don't match, we return unknown. + for (int i = 2; i < c->num_inputs(); ++i) { + const Shape* input = c->input(i); + if (!c->FullyDefined(input) || + !c->Merge(first_input, input, &unused).ok()) { + c->set_output(0, c->UnknownShape()); + return Status::OK(); + } + } + c->set_output(0, first_input); + return Status::OK(); + }) .Doc(R"doc( Forwards the `index`th element of `inputs` to `output`. @@ -79,12 +113,40 @@ output: The forwarded tensor. )doc"); // -------------------------------------------------------------------------- +namespace { +Status MergeShape(InferenceContext* c) { + const Shape* out = c->input(0); + if (!c->RankKnown(out)) { + out = c->UnknownShape(); + } else { + int32 rank = c->Rank(out); + for (int i = 1; i < c->num_inputs(); ++i) { + const Shape* input = c->input(i); + if (c->Rank(input) != rank) { + out = c->UnknownShape(); + break; + } + + for (int d = 0; d < rank; ++d) { + if (c->Value(c->Dim(input, d)) != c->Value(c->Dim(out, d))) { + TF_RETURN_IF_ERROR(c->ReplaceDim(out, d, c->UnknownDim(), &out)); + } + } + } + } + c->set_output(0, out); + c->set_output(1, c->Scalar()); + return Status::OK(); +} +} // namespace + REGISTER_OP("Merge") .Input("inputs: N * T") .Output("output: T") .Output("value_index: int32") .Attr("T: type") .Attr("N: int >= 1") + .SetShapeFn(MergeShape) .Doc(R"doc( Forwards the value of an available tensor from `inputs` to `output`. 
@@ -107,6 +169,7 @@ REGISTER_OP("RefMerge") .Output("value_index: int32") .Attr("T: type") .Attr("N: int >= 1") + .SetShapeFn(MergeShape) .Doc(R"doc( Forwards the value of an available tensor from `inputs` to `output`. @@ -245,15 +308,17 @@ output: The same tensor as `input`. // -------------------------------------------------------------------------- REGISTER_OP("ControlTrigger") - .SetShapeFn(shape_inference::UnchangedShape) - .Doc(R"doc( -Does nothing. Serves as a control trigger for scheduling. Only useful as a -placeholder for control edges. -)doc"); + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"docstring( +Does nothing. Serves as a control trigger for scheduling. + +Only useful as a placeholder for control edges. +)docstring"); // -------------------------------------------------------------------------- REGISTER_OP("Abort") .Attr("error_msg: string = ''") + .SetShapeFn(shape_inference::NoOutputs) .Doc(R"doc( Raise a exception to abort the process when called. diff --git a/tensorflow/core/ops/control_flow_ops_test.cc b/tensorflow/core/ops/control_flow_ops_test.cc new file mode 100644 index 00000000000..9aa14e27a0a --- /dev/null +++ b/tensorflow/core/ops/control_flow_ops_test.cc @@ -0,0 +1,79 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference_testutil.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +TEST(ControlFlowOpsTest, Merge_ShapeFn) { + ShapeInferenceTestOp op("Merge"); + + int n = 3; + std::vector<NodeDefBuilder::NodeOut> src_list; + for (int i = 0; i < n; ++i) src_list.emplace_back("a", 0, DT_FLOAT); + TF_ASSERT_OK(NodeDefBuilder("test", "Merge") + .Input(src_list) + .Attr("N", n) + .Finalize(&op.node_def)); + + // The second output should always be scalar. + // The first output should be unknown if any of the inputs are unknown, or + // if two inputs disagree about rank. + INFER_OK(op, "?;?;?", "?;[]"); + INFER_OK(op, "[2,1];?;[2,1]", "?;[]"); + INFER_OK(op, "[2,1];[2,1];?", "?;[]"); + INFER_OK(op, "[2,1];[2,1];[3,1,2]", "?;[]"); + // If inputs agree on rank, but disagree on specific dimensions, those + // dimensions should be unknown. + INFER_OK(op, "[2,1];[2,1];[3,1]", "[?,d0_1];[]"); + INFER_OK(op, "[2,1];[2,2];[3,1]", "[?,?];[]"); + // Otherwise, all inputs agree and we return the first input. + INFER_OK(op, "[2,1];[2,1];[2,1]", "in0;[]"); +} + +TEST(ControlFlowOpsTest, RefSelect_ShapeFn) { + ShapeInferenceTestOp op("RefSelect"); + + int n = 3; + std::vector<NodeDefBuilder::NodeOut> src_list; + for (int i = 0; i < n; ++i) src_list.emplace_back("a", 1, DT_FLOAT_REF); + TF_ASSERT_OK(NodeDefBuilder("test", "RefSelect") + .Input("index", 0, DT_INT32) + .Input(src_list) + .Attr("N", n) + .Finalize(&op.node_def)); + + // The first argument should be scalar. + INFER_ERROR("Shape must be rank 0 but is rank 1", op, "[2];?;?;?"); + + // If any inputs aren't fully defined, we return an unknown shape. 
+ INFER_OK(op, "?;?;?;?", "?"); + INFER_OK(op, "[];?;?;?", "?"); + INFER_OK(op, "[];[1,2,3];?;?", "?"); + INFER_OK(op, "[];[1,2,3];[1,2,?];[1,2,3]", "?"); + // If inputs disagree on rank or dimension, we return an unknown shape. + INFER_OK(op, "[];[1,2,3];[1,2];[1,2,3]", "?"); + INFER_OK(op, "[];[1,2,3];[1,2,4];[1,2,3]", "?"); + // Otherwise, all inputs agree and we return the first input. + INFER_OK(op, "[];[1,2,3];[1,2,3];[1,2,3]", "in1"); +} + +} // end namespace tensorflow diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc index 22f9f7a2564..8bd806af576 100644 --- a/tensorflow/core/ops/data_flow_ops.cc +++ b/tensorflow/core/ops/data_flow_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_def_builder.h" #include "tensorflow/core/framework/shape_inference.h" @@ -31,6 +32,40 @@ REGISTER_OP("DynamicPartition") .Output("outputs: num_partitions * T") .Attr("num_partitions: int") .Attr("T: type") + .SetShapeFn([](InferenceContext* c) { + int64 num_partitions; + TF_RETURN_IF_ERROR(c->GetAttr("num_partitions", &num_partitions)); + + const Shape* data_shape = c->input(0); + const Shape* partitions_shape = c->input(1); + + if (!c->RankKnown(partitions_shape)) { + return shape_inference::UnknownShape(c); + } + + const int64 rank = c->Rank(partitions_shape); + + // data shape must start with partitions_shape + const Shape* unused; + TF_RETURN_IF_ERROR( + c->MergePrefix(data_shape, partitions_shape, &unused, &unused)); + + // The partition shape is dynamic in the 0th dimension, and matches + // data_shape in the remaining dimensions. 
+ const Shape* unknown_dim0 = c->MakeShape({c->UnknownDim()}); + + const Shape* data_suffix_shape; + TF_RETURN_IF_ERROR(c->Subshape(data_shape, rank, &data_suffix_shape)); + const Shape* result_shape; + TF_RETURN_IF_ERROR( + c->Concatenate(unknown_dim0, data_suffix_shape, &result_shape)); + + for (int i = 0; i < c->num_outputs(); ++i) { + c->set_output(i, result_shape); + } + + return Status::OK(); + }) .Doc(R"doc( Partitions `data` into `num_partitions` tensors using indices from `partitions`. @@ -76,6 +111,37 @@ REGISTER_OP("DynamicStitch") .Output("merged: T") .Attr("N : int >= 2") .Attr("T : type") + .SetShapeFn([](InferenceContext* c) { + int64 num_partitions; + TF_RETURN_IF_ERROR(c->GetAttr("N", &num_partitions)); + + const Shape* extra_shape = c->UnknownShape(); + for (int i = 0; i < num_partitions; ++i) { + const Shape* indices_shape = c->input(i); + const Shape* data_shape = c->input(i + num_partitions); + if (!c->RankKnown(indices_shape)) { + continue; + } + + const int64 indices_rank = c->Rank(indices_shape); + + // Assert that data_shape starts with indices_shape. + const Shape* unused; + TF_RETURN_IF_ERROR( + c->MergePrefix(data_shape, indices_shape, &unused, &unused)); + + // The rest belongs to output. + const Shape* rest; + TF_RETURN_IF_ERROR(c->Subshape(data_shape, indices_rank, &rest)); + TF_RETURN_IF_ERROR(c->Merge(extra_shape, rest, &extra_shape)); + } + + const Shape* output_shape = c->Vector(c->UnknownDim()); + TF_RETURN_IF_ERROR( + c->Concatenate(output_shape, extra_shape, &output_shape)); + c->set_output(0, output_shape); + return Status::OK(); + }) .Doc(R"doc( Interleave the values from the `data` tensors into a single tensor. @@ -131,6 +197,7 @@ REGISTER_OP("RandomShuffleQueue") .Attr("container: string = ''") .Attr("shared_name: string = ''") .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( A queue that randomizes the order of elements. 
@@ -162,6 +229,7 @@ REGISTER_OP("FIFOQueue") .Attr("container: string = ''") .Attr("shared_name: string = ''") .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( A queue that produces elements in first-in first-out order. @@ -187,6 +255,7 @@ REGISTER_OP("PaddingFIFOQueue") .Attr("container: string = ''") .Attr("shared_name: string = ''") .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( A queue that produces elements in first-in first-out order. @@ -220,6 +289,7 @@ REGISTER_OP("PriorityQueue") .Attr("container: string = ''") .Attr("shared_name: string = ''") .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( A queue that produces elements sorted by the first component value. @@ -248,6 +318,7 @@ REGISTER_OP("QueueEnqueue") .Input("components: Tcomponents") .Attr("Tcomponents: list(type) >= 1") .Attr("timeout_ms: int = -1") + .SetShapeFn(shape_inference::UnknownShape) .Doc(R"doc( Enqueues a tuple of one or more tensors in the given queue. @@ -269,6 +340,7 @@ REGISTER_OP("QueueEnqueueMany") .Input("components: Tcomponents") .Attr("Tcomponents: list(type) >= 1") .Attr("timeout_ms: int = -1") + .SetShapeFn(shape_inference::UnknownShape) .Doc(R"doc( Enqueues zero or more tuples of one or more tensors in the given queue. @@ -295,6 +367,7 @@ REGISTER_OP("QueueDequeue") .Output("components: component_types") .Attr("component_types: list(type) >= 1") .Attr("timeout_ms: int = -1") + .SetShapeFn(shape_inference::UnknownShape) .Doc(R"doc( Dequeues a tuple of one or more tensors from the given queue. @@ -319,6 +392,7 @@ REGISTER_OP("QueueDequeueMany") .Output("components: component_types") .Attr("component_types: list(type) >= 1") .Attr("timeout_ms: int = -1") + .SetShapeFn(shape_inference::UnknownShape) .Doc(R"doc( Dequeues n tuples of one or more tensors from the given queue. 
@@ -351,6 +425,7 @@ REGISTER_OP("QueueDequeueUpTo") .Output("components: component_types") .Attr("component_types: list(type) >= 1") .Attr("timeout_ms: int = -1") + .SetShapeFn(shape_inference::UnknownShape) .Doc(R"doc( Dequeues n tuples of one or more tensors from the given queue. @@ -401,6 +476,7 @@ cancel_pending_enqueues: If true, all pending enqueue requests that are REGISTER_OP("QueueSize") .Input("handle: Ref(string)") .Output("size: int32") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Computes the number of elements in the given queue. @@ -415,6 +491,7 @@ REGISTER_OP("Stack") .Attr("elem_type: type") .Attr("stack_name: string = ''") .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( A stack that produces elements in first-in last-out order. @@ -430,6 +507,7 @@ REGISTER_OP("StackPush") .Output("output: T") .Attr("T: type") .Attr("swap_memory: bool = false") + .SetShapeFn(shape_inference::UnknownShape) .Doc(R"doc( Push an element onto the stack. @@ -443,6 +521,7 @@ REGISTER_OP("StackPop") .Input("handle: Ref(string)") .Output("elem: elem_type") .Attr("elem_type: type") + .SetShapeFn(shape_inference::UnknownShape) .Doc(R"doc( Pop the element at the top of the stack. @@ -451,9 +530,7 @@ elem: The tensor that is popped from the top of the stack. elem_type: The type of the elem that is popped. )doc"); -REGISTER_OP("StackClose") - .Input("handle: Ref(string)") - .Doc(R"doc( +REGISTER_OP("StackClose").Input("handle: Ref(string)").Doc(R"doc( Delete the stack from its resource container. handle: The handle to a stack. @@ -469,6 +546,12 @@ REGISTER_OP("TensorArray") .Attr("tensor_array_name: string = ''") .Output("handle: Ref(string)") .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { + const Shape* unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused)); + c->set_output(0, c->Vector(2)); + return Status::OK(); + }) .Doc(R"doc( An array of Tensors of given size, with data written via Write and read via Read or Pack. 
@@ -492,6 +575,14 @@ REGISTER_OP("TensorArrayGrad") .Output("grad_handle: Ref(string)") .Attr("source: string") .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { + const Shape* unused; + const Dimension* unused_dim; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim)); + c->set_output(0, c->Vector(2)); + return Status::OK(); + }) .Doc(R"doc( Creates a TensorArray for storing the gradients of values in the given handle. @@ -545,6 +636,15 @@ REGISTER_OP("TensorArrayWrite") .Input("flow_in: float") .Output("flow_out: float") .Attr("T: type") + .SetShapeFn([](InferenceContext* c) { + const Shape* unused; + const Dimension* unused_dim; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + return shape_inference::ScalarShape(c); + }) .Doc(R"doc( Push an element onto the tensor_array. @@ -561,6 +661,15 @@ REGISTER_OP("TensorArrayRead") .Input("flow_in: float") .Output("value: dtype") .Attr("dtype: type") + .SetShapeFn([](InferenceContext* c) { + const Shape* unused; + const Dimension* unused_dim; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + return shape_inference::UnknownShape(c); + }) .Doc(R"doc( Read an element from the TensorArray into output `value`. 
@@ -576,6 +685,14 @@ REGISTER_OP("TensorArrayPack") .Output("value: dtype") .Attr("dtype: type") .Attr("element_shape: shape = { unknown_rank: true }") + .SetShapeFn([](InferenceContext* c) { + const Shape* unused; + const Dimension* unused_dim; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + return shape_inference::UnknownShape(c); + }) .Doc(R"doc( Pack the elements from the TensorArray into output `value`. @@ -597,6 +714,14 @@ REGISTER_OP("TensorArrayUnpack") .Input("flow_in: float") .Output("flow_out: float") .Attr("T: type") + .SetShapeFn([](InferenceContext* c) { + const Shape* unused; + const Dimension* unused_dim; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + return shape_inference::ScalarShape(c); + }) .Doc(R"doc( Unpack the data from the input value into TensorArray elements. @@ -613,6 +738,16 @@ REGISTER_OP("TensorArrayConcat") .Output("lengths: int64") .Attr("dtype: type") .Attr("element_shape_except0: shape = { unknown_rank: true }") + .SetShapeFn([](InferenceContext* c) { + const Shape* unused; + const Dimension* unused_dim; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + c->set_output(0, c->UnknownShape()); + c->set_output(1, c->Vector(c->UnknownDim())); + return Status::OK(); + }) .Doc(R"doc( Concat the elements from the TensorArray into value `value`. 
@@ -649,6 +784,15 @@ REGISTER_OP("TensorArraySplit") .Input("flow_in: float") .Output("flow_out: float") .Attr("T: type") + .SetShapeFn([](InferenceContext* c) { + const Shape* unused; + const Dimension* unused_dim; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + return shape_inference::ScalarShape(c); + }) .Doc(R"doc( Split the data from the input value into TensorArray elements. @@ -682,6 +826,13 @@ REGISTER_OP("TensorArraySize") .Input("handle: Ref(string)") .Input("flow_in: float") .Output("size: int32") + .SetShapeFn([](InferenceContext* c) { + const Shape* unused; + const Dimension* unused_dim; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim)); + return shape_inference::ScalarShape(c); + }) .Doc(R"doc( Get the current size of the TensorArray. @@ -692,6 +843,13 @@ size: The current size of the TensorArray. REGISTER_OP("TensorArrayClose") .Input("handle: Ref(string)") + .SetShapeFn([](InferenceContext* c) { + const Shape* unused; + const Dimension* unused_dim; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim)); + return Status::OK(); + }) .Doc(R"doc( Delete the TensorArray from its resource container. This enables the user to close and release the resource in the middle of a step/run. @@ -709,6 +867,7 @@ REGISTER_OP("Barrier") .Attr("capacity: int = -1") .Attr("container: string = ''") .Attr("shared_name: string = ''") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Defines a barrier that persists across different graph executions. 
@@ -740,6 +899,16 @@ REGISTER_OP("BarrierInsertMany") .Input("values: T") .Attr("T: type") .Attr("component_index: int") + .SetShapeFn([](InferenceContext* c) { + const Shape* keys = c->input(1); + const Shape* values = c->input(2); + const Shape* unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(keys, 1, &keys)); + TF_RETURN_IF_ERROR(c->WithRankAtLeast(values, 1, &values)); + TF_RETURN_IF_ERROR(c->Merge(keys, c->Vector(c->Dim(values, 0)), &unused)); + return Status::OK(); + }) .Doc(R"doc( For each key, assigns the respective value to the specified component. @@ -765,6 +934,7 @@ REGISTER_OP("BarrierTakeMany") .Attr("allow_small_batch: bool = false") .Attr("wait_for_incomplete: bool = false") .Attr("timeout_ms: int = -1") + .SetShapeFn(shape_inference::UnknownShape) .Doc(R"doc( Takes the given number of completed elements from a barrier. @@ -815,6 +985,7 @@ cancel_pending_enqueues: If true, all pending enqueue requests that are REGISTER_OP("BarrierReadySize") .Input("handle: Ref(string)") .Output("size: int32") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Computes the number of complete elements in the given barrier. @@ -826,6 +997,7 @@ size: The number of complete elements (i.e. those with all of their value REGISTER_OP("BarrierIncompleteSize") .Input("handle: Ref(string)") .Output("size: int32") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Computes the number of incomplete elements in the given barrier. @@ -936,6 +1108,7 @@ REGISTER_OP("HashTable") .Attr("key_dtype: type") .Attr("value_dtype: type") .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Creates a non-initialized hash table. @@ -959,6 +1132,7 @@ REGISTER_OP("MutableHashTable") .Attr("key_dtype: type") .Attr("value_dtype: type") .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Creates an empty hash table. 
@@ -983,6 +1157,7 @@ REGISTER_OP("MutableHashTableOfTensors") .Attr("value_dtype: type") .Attr("value_shape: shape = {}") .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Creates an empty hash table. @@ -1061,6 +1236,7 @@ REGISTER_OP("GetSessionHandle") .Input("value: T") .Output("handle: string") .Attr("T: type") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Store the input tensor in the state of the current session. @@ -1072,6 +1248,7 @@ REGISTER_OP("GetSessionTensor") .Input("handle: string") .Output("value: dtype") .Attr("dtype: type") + .SetShapeFn(shape_inference::UnknownShape) .Doc(R"doc( Get the value of the tensor specified by its handle. @@ -1080,9 +1257,7 @@ value: The tensor for the given handle. dtype: The type of the output value. )doc"); -REGISTER_OP("DeleteSessionTensor") - .Input("handle: string") - .Doc(R"doc( +REGISTER_OP("DeleteSessionTensor").Input("handle: string").Doc(R"doc( Delete the tensor specified by its handle in the session. handle: The handle for a tensor stored in the session state. diff --git a/tensorflow/core/ops/data_flow_ops_test.cc b/tensorflow/core/ops/data_flow_ops_test.cc index e1f815a2520..d00c989f4b1 100644 --- a/tensorflow/core/ops/data_flow_ops_test.cc +++ b/tensorflow/core/ops/data_flow_ops_test.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference_testutil.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -90,4 +92,52 @@ TEST(MathOpsTest, InitializeTableFromTextFile) { INFER_ERROR("Shape must be rank 0 but is rank 1", op, "[];[1]"); } +TEST(MathOpsTest, DynamicPartition) { + ShapeInferenceTestOp op("DynamicPartition"); + TF_ASSERT_OK(NodeDefBuilder("test", "DynamicPartition") + .Input("data", 0, DT_FLOAT_REF) + .Input("indices", 0, DT_INT32) + .Attr("num_partitions", 4) + .Finalize(&op.node_def)); + + // Unknown rank for indices, so unknown shape. + INFER_OK(op, "?;?", "?;?;?;?"); + + // 3 dimensional data, 2 dimensional indices. + INFER_OK(op, "[3,4,5];[3,4]", "[?,d0_2];[?,d0_2];[?,d0_2];[?,d0_2]"); + + TF_ASSERT_OK(NodeDefBuilder("test", "DynamicPartition") + .Input("data", 0, DT_FLOAT) + .Input("indices", 0, DT_INT32) + .Attr("num_partitions", 2) + .Finalize(&op.node_def)); + + // Suffix after matching prefix is copied over. + INFER_OK(op, "[3,4,5,6];[3,4]", "[?,d0_2,d0_3];[?,d0_2,d0_3]"); + + // Does not start with proper prefix + INFER_ERROR("Dimensions must be equal, but are 4 and 100", op, + "[3,4,5];[3,100]"); +} + +TEST(MathOpsTest, DynamicStitch) { + ShapeInferenceTestOp op("DynamicStitch"); + TF_ASSERT_OK( + NodeDefBuilder("test", "DynamicStitch") + .Input({{"indices", 0, DT_INT32}, {"indices_2", 1, DT_INT32}}) + .Input({{"data", 0, DT_FLOAT}, {"data_2", 1, DT_FLOAT}}) + .Attr("N", 2) + .Finalize(&op.node_def)); + + INFER_OK(op, "[2,3];[5,6];[2,3,4,5];[5,6,4,5]", "[?,d2_2,d2_3]"); + + // Bad prefix for the second data input. 
+ INFER_ERROR("Dimensions must be equal, but are 10 and 5", op, + "[2,3];[5,6];[2,3,4,5];[10,11,4,5]"); + + // Inconsistent suffix dimensions + INFER_ERROR("Dimension 0 in both shapes must be equal, but are 4 and 13", op, + "[2,3];[5,6];[2,3,4,5];[5,6,13,14]"); +} + } // end namespace tensorflow diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc index 18869205971..5a55493517b 100644 --- a/tensorflow/core/ops/image_ops.cc +++ b/tensorflow/core/ops/image_ops.cc @@ -177,6 +177,10 @@ REGISTER_OP("ResizeBilinearGrad") .Output("output: T") .Attr("T: {float, half, double}") .Attr("align_corners: bool = false") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->input(1)); + return Status::OK(); + }) .Doc(R"doc( Computes the gradient of bilinear interpolation. @@ -219,6 +223,27 @@ REGISTER_OP("ResizeNearestNeighborGrad") .Output("output: T") .Attr("T: {uint8, int8, int32, half, float, double}") .Attr("align_corners: bool = false") + .SetShapeFn([](InferenceContext* c) { + const Shape* input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + const Shape* unused; + const Dimension* unused_dim; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused)); + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(unused, 0), 2, &unused_dim)); + const Tensor* size = c->input_tensor(1); + if (size == nullptr) { + TF_RETURN_IF_ERROR(c->ReplaceDim(input, 1, c->UnknownDim(), &input)); + TF_RETURN_IF_ERROR(c->ReplaceDim(input, 2, c->UnknownDim(), &input)); + } else { + auto size_vec = size->vec(); + TF_RETURN_IF_ERROR( + c->ReplaceDim(input, 1, c->MakeDim(size_vec(0)), &input)); + TF_RETURN_IF_ERROR( + c->ReplaceDim(input, 2, c->MakeDim(size_vec(1)), &input)); + } + c->set_output(0, input); + return Status::OK(); + }) .Doc(R"doc( Computes the gradient of nearest neighbor interpolation. 
@@ -771,6 +796,13 @@ REGISTER_OP("CropAndResizeGradImage") .Output("output: T") .Attr("T: {float, half, double}") .Attr("method: {'bilinear'} = 'bilinear'") + .SetShapeFn([](InferenceContext* c) { + const Shape* out; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(3, &out)); + TF_RETURN_IF_ERROR(c->WithRank(out, 4, &out)); + c->set_output(0, out); + return Status::OK(); + }) .Doc(R"doc( Computes the gradient of the crop_and_resize op wrt the input image tensor. @@ -803,6 +835,10 @@ REGISTER_OP("CropAndResizeGradBoxes") .Output("output: float") .Attr("T: {uint8, int8, int16, int32, int64, half, float, double}") .Attr("method: {'bilinear'} = 'bilinear'") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->input(2)); + return Status::OK(); + }) .Doc(R"doc( Computes the gradient of the crop_and_resize op wrt the input boxes tensor. @@ -834,6 +870,10 @@ REGISTER_OP("NonMaxSuppression") .Input("max_output_size: int32") .Output("selected_indices: int32") .Attr("iou_threshold: float = 0.5") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Vector(c->UnknownDim())); + return Status::OK(); + }) .Doc(R"doc( Greedily selects a subset of bounding boxes in descending order of score, pruning away boxes that have high intersection-over-union (IOU) overlap diff --git a/tensorflow/core/ops/image_ops_test.cc b/tensorflow/core/ops/image_ops_test.cc index 3cb33fe889b..fc9640ffb40 100644 --- a/tensorflow/core/ops/image_ops_test.cc +++ b/tensorflow/core/ops/image_ops_test.cc @@ -160,4 +160,37 @@ TEST(ImageOpsTest, CropAndResize_ShapeFn) { INFER_ERROR("Dimension must be 4 but is 3", op, "?;[?,3];?;?"); } +TEST(ImageOpsTest, ResizeNearestNeighborGrad_ShapeFn) { + ShapeInferenceTestOp op("ResizeNearestNeighborGrad"); + op.input_tensors.resize(2); + + // Rank and size checks. 
+ INFER_ERROR("Shape must be rank 4 but is rank 3", op, "[1,2,3];?"); + INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;[1,2]"); + INFER_ERROR("Dimension must be 2 but is 1", op, "?;[1]"); + + // When the size tensor is not a constant, the middle dims are unknown. + INFER_OK(op, "[1,?,3,?];[2]", "[d0_0,?,?,d0_3]"); + + Tensor size_tensor = test::AsTensor<int32>({20, 30}); + op.input_tensors[1] = &size_tensor; + INFER_OK(op, "[1,?,3,?];[2]", "[d0_0,20,30,d0_3]"); +} + +TEST(ImageOpsTest, CropAndResizeGradImage_ShapeFn) { + ShapeInferenceTestOp op("CropAndResizeGradImage"); + op.input_tensors.resize(4); + + // Rank checks. + INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;?;?;[1,2]"); + + // Unknown image_size should result in output of rank 4 with unknown dims. + INFER_OK(op, "?;?;?;?", "[?,?,?,?]"); + + // Known image_size should result in full shape information. + Tensor image_size = test::AsTensor<int32>({10, 20, 30, 40}); + op.input_tensors[3] = &image_size; + INFER_OK(op, "?;?;?;[1]", "[10, 20, 30, 40]"); +} + } // end namespace tensorflow diff --git a/tensorflow/core/ops/io_ops.cc b/tensorflow/core/ops/io_ops.cc index 8c27cbb1eab..1d528660cfa 100644 --- a/tensorflow/core/ops/io_ops.cc +++ b/tensorflow/core/ops/io_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -237,6 +238,7 @@ REGISTER_OP("WholeFileReader") .Attr("container: string = ''") .Attr("shared_name: string = ''") .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( A Reader that outputs the entire contents of a file as a value. 
@@ -256,6 +258,7 @@ REGISTER_OP("TextLineReader") .Attr("container: string = ''") .Attr("shared_name: string = ''") .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( A Reader that outputs the lines of a file delimited by '\n'. @@ -275,6 +278,7 @@ REGISTER_OP("FixedLengthRecordReader") .Attr("container: string = ''") .Attr("shared_name: string = ''") .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( A Reader that outputs fixed-length records from a file. @@ -291,6 +295,7 @@ REGISTER_OP("TFRecordReader") .Attr("shared_name: string = ''") .Attr("compression_type: string = ''") .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( A Reader that outputs the records from a TensorFlow Records file. @@ -306,6 +311,7 @@ REGISTER_OP("IdentityReader") .Attr("container: string = ''") .Attr("shared_name: string = ''") .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( A Reader that outputs the queued work as both the key and value. 
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc index ab4b2644b24..54b8e22b7ee 100644 --- a/tensorflow/core/ops/linalg_ops.cc +++ b/tensorflow/core/ops/linalg_ops.cc @@ -115,6 +115,111 @@ Status BatchMatrixSolveShapeFn(InferenceContext* c, bool square) { return Status::OK(); } +Status BatchSvdShapeHelperFn(InferenceContext* c, const Shape* input) { + const Dimension* m = c->Dim(input, -2); + const Dimension* n = c->Dim(input, -1); + const Dimension* p; + TF_RETURN_IF_ERROR(c->Min(m, n, &p)); + const Shape* batch_shape; + TF_RETURN_IF_ERROR(c->Subshape(input, 0, -2, &batch_shape)); + const Shape* e_shape; + TF_RETURN_IF_ERROR(c->Concatenate(batch_shape, c->Vector(p), &e_shape)); + c->set_output(0, e_shape); + bool compute_uv; + TF_RETURN_IF_ERROR(c->GetAttr("compute_uv", &compute_uv)); + if (compute_uv) { + const Shape* u_shape; + const Shape* v_shape; + bool full_matrices; + TF_RETURN_IF_ERROR(c->GetAttr("full_matrices", &full_matrices)); + if (full_matrices) { + TF_RETURN_IF_ERROR( + c->Concatenate(batch_shape, c->Matrix(m, m), &u_shape)); + TF_RETURN_IF_ERROR( + c->Concatenate(batch_shape, c->Matrix(n, n), &v_shape)); + } else { + TF_RETURN_IF_ERROR( + c->Concatenate(batch_shape, c->Matrix(m, p), &u_shape)); + TF_RETURN_IF_ERROR( + c->Concatenate(batch_shape, c->Matrix(n, p), &v_shape)); + } + c->set_output(1, u_shape); + c->set_output(2, v_shape); + } else { + c->set_output(1, c->Vector(0ll)); + c->set_output(2, c->Vector(0ll)); + } + return Status::OK(); +} + +// Input is [M,N]. First output is [min(M,N)]. +// Second and third outputs are: +// [0]; [0], if compute_uv is false. +// [M,M]; [N,N], if compute_uv is true and full_matrices is true, +// [M,P]; [N,P], if compute_uv is true and full_matrices is false, +// where P = min(M,N). +Status SvdShapeFn(InferenceContext* c) { + const Shape* input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input)); + return BatchSvdShapeHelperFn(c, input); +} + +// Input is [...,M,N]. 
First output is [...,min(M,N)]. +// Second and third outputs are: +// [0]; [0], if compute_uv is false. +// [...,M,M]; [...,N,N], if compute_uv is true and full_matrices is true, +// [...,M,P]; [...,N,P], if compute_uv is true and full_matrices is false, +// where P = min(M,N). +Status BatchSvdShapeFn(InferenceContext* c) { + const Shape* input; + TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &input)); + return BatchSvdShapeHelperFn(c, input); +} + +// Input is [N,N]. Outputs are: +// [N];[0], if compute_v is false, +// [N];[N,N], if compute_v is true. +Status SelfAdjointEigV2ShapeFn(InferenceContext* c) { + const Shape* input; + TF_RETURN_IF_ERROR(MakeSquareMatrix(c, c->input(0), &input)); + const Dimension* n; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(input, 0), c->Dim(input, 1), &n)); + c->set_output(0, c->Vector(n)); + bool compute_v; + TF_RETURN_IF_ERROR(c->GetAttr("compute_v", &compute_v)); + if (compute_v) { + c->set_output(1, c->Matrix(n, n)); + } else { + c->set_output(1, c->Vector(0ll)); + } + return Status::OK(); +} + +// Input is [...,N,N]. Outputs are: +// [...,N];[0], if compute_v is false, +// [...,N];[...,N,N], if compute_v is true. 
+Status BatchSelfAdjointEigV2ShapeFn(InferenceContext* c) { + const Shape* input; + TF_RETURN_IF_ERROR(MakeBatchSquareMatrix(c, c->input(0), &input)); + const Dimension* n; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(input, -2), c->Dim(input, -1), &n)); + const Shape* batch_shape; + TF_RETURN_IF_ERROR(c->Subshape(input, 0, -2, &batch_shape)); + const Shape* e_shape; + TF_RETURN_IF_ERROR(c->Concatenate(batch_shape, c->Vector(n), &e_shape)); + c->set_output(0, e_shape); + bool compute_v; + TF_RETURN_IF_ERROR(c->GetAttr("compute_v", &compute_v)); + if (compute_v) { + const Shape* v_shape; + TF_RETURN_IF_ERROR(c->Concatenate(batch_shape, c->Matrix(n, n), &v_shape)); + c->set_output(1, v_shape); + } else { + c->set_output(1, c->Vector(0ll)); + } + return Status::OK(); +} + } // namespace REGISTER_OP("MatrixDeterminant") @@ -128,7 +233,7 @@ REGISTER_OP("MatrixDeterminant") return Status::OK(); }) .Doc(R"doc( -Calculates the determinant of a square matrix. +Computes the determinant of a square matrix. input: A tensor of shape `[M, M]`. output: A scalar, equal to the determinant of the input. @@ -152,7 +257,7 @@ REGISTER_OP("BatchMatrixDeterminant") return Status::OK(); }) .Doc(R"doc( -Calculates the determinants for a batch of square matrices. +Computes the determinants for a batch of square matrices. The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form square matrices. The output is a tensor containing the determinants @@ -169,7 +274,7 @@ REGISTER_OP("MatrixInverse") .Attr("T: {double, float}") .SetShapeFn(UnchangedSquareShapeFn) .Doc(R"doc( -Calculates the inverse of a square invertible matrix or its adjoint (conjugate +Computes the inverse of a square invertible matrix or its adjoint (conjugate transpose). The op uses LU decomposition with partial pivoting to compute the inverse. 
@@ -191,7 +296,7 @@ REGISTER_OP("BatchMatrixInverse") .Attr("T: {double, float}") .SetShapeFn(BatchUnchangedSquareShapeFn) .Doc(R"doc( -Calculates the inverse of square invertible matrices or their adjoints +Computes the inverse of square invertible matrices or their adjoints (conjugate transposes). The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions @@ -214,7 +319,7 @@ REGISTER_OP("Cholesky") .Attr("T: {double, float}") .SetShapeFn(UnchangedSquareShapeFn) .Doc(R"doc( -Calculates the Cholesky decomposition of a square matrix. +Computes the Cholesky decomposition of a square matrix. The input has to be symmetric and positive definite. Only the lower-triangular part of the input will be used for this operation. The upper-triangular part @@ -233,7 +338,7 @@ REGISTER_OP("BatchCholesky") .Attr("T: {double, float}") .SetShapeFn(BatchUnchangedSquareShapeFn) .Doc(R"doc( -Calculates the Cholesky decomposition of a batch of square matrices. +Computes the Cholesky decomposition of a batch of square matrices. The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form square matrices, with the same constraints as the single matrix Cholesky @@ -251,16 +356,16 @@ REGISTER_OP("CholeskyGrad") .Attr("T: {float, double}") .SetShapeFn(UnchangedSquareShapeFn) .Doc(R"doc( -Calculates the reverse mode backpropagated gradient of the Cholesky algorithm. +Computes the reverse mode backpropagated gradient of the Cholesky algorithm. For an explanation see "Differentiation of the Cholesky algorithm" by Iain Murray http://arxiv.org/abs/1602.07527. l: Output of Cholesky algorithm l = chol(A). Shape is `[M, M]`. Algorithm depends only on lower triangular part of this matrix. -grad: df/dl where f is some scalar function. Shape is `[M, M]'. +grad: df/dl where f is some scalar function. Shape is `[M, M]`. Algorithm depends only on lower triangular part of this matrix. -output: Symmetrized version of df/dA . Shape is `[M, M]'. 
+output: Symmetrized version of df/dA . Shape is `[M, M]`. )doc"); REGISTER_OP("BatchCholeskyGrad") @@ -270,7 +375,7 @@ REGISTER_OP("BatchCholeskyGrad") .Attr("T: {float, double}") .SetShapeFn(BatchUnchangedSquareShapeFn) .Doc(R"doc( -Calculates the reverse mode backpropagated gradient of the Cholesky algorithm. +Computes the reverse mode backpropagated gradient of the Cholesky algorithm. For an explanation see "Differentiation of the Cholesky algorithm" by Iain Murray http://arxiv.org/abs/1602.07527. @@ -278,16 +383,17 @@ Iain Murray http://arxiv.org/abs/1602.07527. l: Output of batch Cholesky algorithm l = batch_cholesky(A). Shape is `[..., M, M]`. Algorithm depends only on lower triangular part of the innermost matrices of this tensor. -grad: df/dl where f is some scalar function. Shape is `[..., M, M]'. +grad: df/dl where f is some scalar function. Shape is `[..., M, M]`. Algorithm depends only on lower triangular part of the innermost matrices of this tensor. -output: Symmetrized version of df/dA . Shape is `[..., M, M]' +output: Symmetrized version of df/dA . Shape is `[..., M, M]` )doc"); REGISTER_OP("SelfAdjointEig") .Input("input: T") .Output("output: T") .Attr("T: {double, float}") + .Deprecated(11, "Use SelfAdjointEigV2 instead.") .SetShapeFn([](InferenceContext* c) { const Shape* input; TF_RETURN_IF_ERROR(MakeSquareMatrix(c, c->input(0), &input)); @@ -299,7 +405,7 @@ REGISTER_OP("SelfAdjointEig") return Status::OK(); }) .Doc(R"doc( -Calculates the Eigen Decomposition of a square Self-Adjoint matrix. +Computes the Eigen Decomposition of a square Self-Adjoint matrix. Only the lower-triangular part of the input will be used in this case. The upper-triangular part will not be read. 
@@ -315,6 +421,7 @@ REGISTER_OP("BatchSelfAdjointEig") .Input("input: T") .Output("output: T") .Attr("T: {double, float}") + .Deprecated(11, "Use BatchSelfAdjointEigV2 instead.") .SetShapeFn([](InferenceContext* c) { const Shape* input; TF_RETURN_IF_ERROR(MakeBatchSquareMatrix(c, c->input(0), &input)); @@ -330,19 +437,75 @@ REGISTER_OP("BatchSelfAdjointEig") return Status::OK(); }) .Doc(R"doc( -Calculates the Eigen Decomposition of a batch of square self-adjoint matrices. +Computes the Eigen Decomposition of a batch of square self-adjoint matrices. The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form square matrices, with the same constraints as the single matrix SelfAdjointEig. -The result is a '[..., M+1, M] matrix with [..., 0,:] containing the +The result is a [..., M+1, M] matrix with [..., 0,:] containing the eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. input: Shape is `[..., M, M]`. output: Shape is `[..., M+1, M]`. )doc"); +REGISTER_OP("SelfAdjointEigV2") + .Input("input: T") + .Output("e: T") + .Output("v: T") + .Attr("compute_v: bool = True") + .Attr("T: {double, float}") + .SetShapeFn(SelfAdjointEigV2ShapeFn) + .Doc(R"doc( +Computes the eigen decomposition of a self-adjoint (\"symmetric\") matrix. + +Computes the eigenvalues and (optionally) eigenvectors such that +`input = v * diag(e)`. + +```prettyprint +# a is a self-adjoint matrix. +# e is a vector of eigenvalues. +# v is a matrix of eigenvectors. +e, v = self_adjoint_eig(a) +e = self_adjoint_eig(a, compute_v=False) +``` + +input: `Tensor` input of shape `[N, N]`. +compute_v: If `True` then eigenvectors will be computed and returned in `v`. + Otherwise, only the eigenvalues will be computed. +e: Eigenvalues. Shape is `[N]`. +v: Eigenvectors. Shape is `[N, N]`. 
+)doc"); + +REGISTER_OP("BatchSelfAdjointEigV2") + .Input("input: T") + .Output("e: T") + .Output("v: T") + .Attr("compute_v: bool = True") + .Attr("T: {double, float}") + .SetShapeFn(BatchSelfAdjointEigV2ShapeFn) + .Doc(R"doc( +Computes the eigen decomposition of a batch of square self-adjoint matrices. + +Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in +`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. + +```prettyprint +# a is a tensor. +# e is a tensor of eigenvalues. +# v is a tensor of eigenvectors. +e, v = batch_self_adjoint_eig(a) +e = batch_self_adjoint_eig(a, compute_v=False) +``` + +input: `Tensor` input of shape `[N, N]`. +compute_v: If `True` then eigenvectors will be computed and returned in `v`. + Otherwise, only the eigenvalues will be computed. +e: Eigenvalues. Shape is `[N]`. +v: Eigenvectors. Shape is `[N, N]`. +)doc"); + REGISTER_OP("MatrixSolve") .Input("matrix: T") .Input("rhs: T") @@ -526,10 +689,10 @@ REGISTER_OP("BatchMatrixSolveLs") Solves multiple linear least-squares problems. `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions -form square matrices. Rhs is a tensor of shape `[..., M, K]`. The output -is a tensor shape `[..., N, K]` where each output matrix solves each of -the equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :] in the -least squares sense. +form matrices of size `[M, N]`. Rhs is a tensor of shape `[..., M, K]`. +The output is a tensor shape `[..., N, K]` where each output matrix solves +each of the equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :] +in the least squares sense. Below we will use the following notation for each pair of matrix and right-hand sides in the batch: @@ -563,4 +726,84 @@ rhs: Shape is `[..., M, K]`. output: Shape is `[..., N, K]`. 
)doc"); +REGISTER_OP("Svd") + .Input("input: T") + .Output("s: T") + .Output("u: T") + .Output("v: T") + .Attr("compute_uv: bool = True") + .Attr("full_matrices: bool = False") + .Attr("T: {double, float}") + .SetShapeFn(SvdShapeFn) + .Doc(R"doc( +Computes the singular value decomposition of a matrix. + +Computes the SVD of if `input` such that `input = u * diag(s) * transpose(v)` + +```prettyprint +# a is a matrix. +# s is a vector of singular values. +# u is the matrix of left singular vectors. +# v is a matrix of right singular vectors. +s, u, v = svd(a) +s, _, _ = svd(a, compute_uv=False) +``` + +input: Shape is `[M, N]`. Let `P` be the minimum of `M` and `N`. +s: Singular values. Shape is `[P]`. +u: Left singular vectors; if `full_matrices` is `False` then shape is `[M, M]`. + If `full_matrices` is `True` then shape is `[M, P]`. + Undefined if `compute_uv` is `False`. +v: Left singular vectors. If `full_matrices` is `False` then shape is `[N, N]`. + If `full_matrices` is `True` then shape is `[N, P]`. + Undefined if `compute_uv` is false. +compute_uv: If true, left and right singular vectors will be + computed and returned in `u` and `v`, respectively. + If false, `u` and `v` are not set and should never referenced. +full_matrices: If true, compute full-sized `u` and `v`. If false + (the default), compute only the leading `P` singular vectors. + Ignored if `compute_uv` is `False`. +)doc"); + +REGISTER_OP("BatchSvd") + .Input("input: T") + .Output("s: T") + .Output("u: T") + .Output("v: T") + .Attr("compute_uv: bool = True") + .Attr("full_matrices: bool = False") + .Attr("T: {double, float}") + .SetShapeFn(BatchSvdShapeFn) + .Doc(R"doc( +Computes the singular value decompositions of a batch of matrices. + +Computes the SVD of each inner matrix in `input` such that +`input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])` + +```prettyprint +# a is a tensor containing a batch of matrices. 
+# s is a tensor of singular values for each matrix. +# u is the tensor containing the left singular vectors for each matrix. +# v is the tensor containing the right singular vectors for each matrix. +s, u, v = batch_svd(a) +s, _, _ = batch_svd(a, compute_uv=False) +``` + +input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions + form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`. +s: Singular values. Shape is `[..., P]`. +u: Left singular vectors. If `full_matrices` is `False` then shape is + `[..., M, P]`; if `full_matrices` is `True` then shape is + `[..., M, M]`. Undefined if `compute_uv` is `False`. +v: Right singular vectors. If `full_matrices` is `False` then shape is + `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`. + Undefined if `compute_uv` is false. +compute_uv: If true, left and right singular vectors will be + computed and returned in `u` and `v`, respectively. + If false, `u` and `v` are not set and should never be referenced. +full_matrices: If true, compute full-sized `u` and `v`. If false + (the default), compute only the leading `P` singular vectors. + Ignored if `compute_uv` is `False`. +)doc"); + } // namespace tensorflow diff --git a/tensorflow/core/ops/linalg_ops_test.cc b/tensorflow/core/ops/linalg_ops_test.cc index 84e888bb9c9..6414db13a41 100644 --- a/tensorflow/core/ops/linalg_ops_test.cc +++ b/tensorflow/core/ops/linalg_ops_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License.
==============================================================================*/ +#include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference_testutil.h" #include "tensorflow/core/platform/test.h" @@ -112,6 +113,70 @@ TEST(LinalgOpsTest, BatchSelfAdjointEig_ShapeFn) { INFER_OK(op, "[5,?,7,?,1]", "[d0_0,d0_1,d0_2,2,d0_4]"); } +TEST(LinalgOpsTest, SelfAdjointEigV2_ShapeFn) { + ShapeInferenceTestOp op("SelfAdjointEigV2"); + auto set_compute_v = [&op](bool compute_v) { + TF_CHECK_OK(NodeDefBuilder("test", "Pack") + .Input({{"input", 0, DT_FLOAT}}) + .Attr("compute_v", compute_v) + .Finalize(&op.node_def)); + }; + set_compute_v(false); + INFER_OK(op, "?", "[?];[0]"); + INFER_OK(op, "[?,?]", "[d0_0|d0_1];[0]"); + INFER_OK(op, "[1,?]", "[d0_0|d0_1];[0]"); + INFER_OK(op, "[?,1]", "[d0_0|d0_1];[0]"); + INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[1]"); + INFER_ERROR("Dimensions must be equal, but are 1 and 2", op, "[1,2]"); + + set_compute_v(true); + INFER_OK(op, "?", "[?];[?,?]"); + INFER_OK(op, "[?,?]", "[d0_0|d0_1];[d0_0|d0_1,d0_0|d0_1]"); + INFER_OK(op, "[1,?]", "[d0_0|d0_1];[d0_0|d0_1,d0_0|d0_1]"); + INFER_OK(op, "[?,1]", "[d0_0|d0_1];[d0_0|d0_1,d0_0|d0_1]"); + INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[1]"); + INFER_ERROR("Dimensions must be equal, but are 1 and 2", op, "[1,2]"); +} + +TEST(LinalgOpsTest, BatchSelfAdjointEigV2_ShapeFn) { + ShapeInferenceTestOp op("BatchSelfAdjointEigV2"); + auto set_compute_v = [&op](bool compute_v) { + TF_CHECK_OK(NodeDefBuilder("test", "Pack") + .Input({{"input", 0, DT_FLOAT}}) + .Attr("compute_v", compute_v) + .Finalize(&op.node_def)); + }; + + set_compute_v(false); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]"); + INFER_ERROR("Dimensions must be equal, but are 1 and 2", op, "[1,2]"); + INFER_ERROR("Dimensions must be equal, but are 1 and 2", op, "[3,1,2]"); + + INFER_OK(op, "?", "?;[0]"); + 
INFER_OK(op, "[?,?]", "[d0_0|d0_1];[0]"); + INFER_OK(op, "[1,?]", "[d0_0|d0_1];[0]"); + INFER_OK(op, "[?,1]", "[d0_0|d0_1];[0]"); + + // Repeat previous block of tests with input rank > 2. + INFER_OK(op, "[5,?,7,?,?]", "[d0_0,d0_1,d0_2,d0_3|d0_4];[0]"); + INFER_OK(op, "[5,?,7,1,?]", "[d0_0,d0_1,d0_2,d0_3|d0_4];[0]"); + INFER_OK(op, "[5,?,7,?,1]", "[d0_0,d0_1,d0_2,d0_3|d0_4];[0]"); + + set_compute_v(true); + INFER_OK(op, "?", "?;?"); + INFER_OK(op, "[?,?]", "[d0_0|d0_1];[d0_0|d0_1,d0_0|d0_1]"); + INFER_OK(op, "[1,?]", "[d0_0|d0_1];[d0_0|d0_1,d0_0|d0_1]"); + INFER_OK(op, "[?,1]", "[d0_0|d0_1];[d0_0|d0_1,d0_0|d0_1]"); + + // Repeat previous block of tests with input rank > 2. + INFER_OK(op, "[5,?,7,?,?]", + "[d0_0,d0_1,d0_2,d0_3|d0_4];[d0_0,d0_1,d0_2,d0_3|d0_4,d0_3|d0_4]"); + INFER_OK(op, "[5,?,7,1,?]", + "[d0_0,d0_1,d0_2,d0_3|d0_4];[d0_0,d0_1,d0_2,d0_3|d0_4,d0_3|d0_4]"); + INFER_OK(op, "[5,?,7,?,1]", + "[d0_0,d0_1,d0_2,d0_3|d0_4];[d0_0,d0_1,d0_2,d0_3|d0_4,d0_3|d0_4]"); +} + TEST(LinalgOpsTest, SquareMatrixSolve_ShapeFn) { for (const char* op_name : {"MatrixSolve", "MatrixTriangularSolve"}) { ShapeInferenceTestOp op(op_name); @@ -200,4 +265,100 @@ TEST(LinalgOpsTest, BatchMatrixSolveLs_ShapeFn) { INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "?;[1];?"); } +TEST(LinalgOpsTest, Svd_ShapeFn) { + ShapeInferenceTestOp op("Svd"); + auto set_attrs = [&op](bool compute_uv, bool full_matrices) { + TF_CHECK_OK(NodeDefBuilder("test", "Svd") + .Input({"input", 0, DT_FLOAT}) + .Attr("compute_uv", compute_uv) + .Attr("full_matrices", full_matrices) + .Finalize(&op.node_def)); + }; + + set_attrs(false, false); + INFER_OK(op, "?", "[?];[0];[0]"); + INFER_OK(op, "[?,?]", "[?];[0];[0]"); + INFER_OK(op, "[2,?]", "[?];[0];[0]"); + INFER_OK(op, "[?,2]", "[?];[0];[0]"); + INFER_OK(op, "[2,2]", "[d0_0];[0];[0]"); + INFER_OK(op, "[3,2]", "[d0_1];[0];[0]"); + INFER_OK(op, "[2,3]", "[d0_0];[0];[0]"); + INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[1]"); + 
INFER_ERROR("Shape must be rank 2 but is rank 3", op, "[1,2,3]"); + + set_attrs(true, false); + INFER_OK(op, "?", "[?];[?,?];[?,?]"); + INFER_OK(op, "[?,?]", "[?];[d0_0,?];[d0_1,?]"); + INFER_OK(op, "[2,?]", "[?];[d0_0,?];[d0_1,?]"); + INFER_OK(op, "[?,2]", "[?];[d0_0,?];[d0_1,?]"); + INFER_OK(op, "[2,2]", "[d0_0];[d0_0,d0_0];[d0_1,d0_0]"); + INFER_OK(op, "[3,2]", "[d0_1];[d0_0,d0_1];[d0_1,d0_1]"); + INFER_OK(op, "[2,3]", "[d0_0];[d0_0,d0_0];[d0_1,d0_0]"); + INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[1]"); + INFER_ERROR("Shape must be rank 2 but is rank 3", op, "[1,2,3]"); + + set_attrs(true, true); + INFER_OK(op, "?", "[?];[?,?];[?,?]"); + INFER_OK(op, "[?,?]", "[?];[d0_0,d0_0];[d0_1,d0_1]"); + INFER_OK(op, "[2,?]", "[?];[d0_0,d0_0];[d0_1,d0_1]"); + INFER_OK(op, "[?,2]", "[?];[d0_0,d0_0];[d0_1,d0_1]"); + INFER_OK(op, "[2,2]", "[d0_0];[d0_0,d0_0];[d0_1,d0_1]"); + INFER_OK(op, "[3,2]", "[d0_1];[d0_0,d0_0];[d0_1,d0_1]"); + INFER_OK(op, "[2,3]", "[d0_0];[d0_0,d0_0];[d0_1,d0_1]"); + INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[1]"); + INFER_ERROR("Shape must be rank 2 but is rank 3", op, "[1,2,3]"); +} + +TEST(LinalgOpsTest, BatchSvd_ShapeFn) { + ShapeInferenceTestOp op("BatchSvd"); + auto set_attrs = [&op](bool compute_uv, bool full_matrices) { + TF_CHECK_OK(NodeDefBuilder("test", "BatchSvd") + .Input({"input", 0, DT_FLOAT}) + .Attr("compute_uv", compute_uv) + .Attr("full_matrices", full_matrices) + .Finalize(&op.node_def)); + }; + set_attrs(false, false); + INFER_OK(op, "?", "?;[0];[0]"); + INFER_OK(op, "[?,?,?]", "[d0_0,?];[0];[0]"); + INFER_OK(op, "[4,?,?]", "[d0_0,?];[0];[0]"); + INFER_OK(op, "[4,2,?]", "[d0_0,?];[0];[0]"); + INFER_OK(op, "[4,?,2]", "[d0_0,?];[0];[0]"); + INFER_OK(op, "[?,2,2]", "[d0_0,d0_1];[0];[0]"); + INFER_OK(op, "[4,2,2]", "[d0_0,d0_1];[0];[0]"); + INFER_OK(op, "[?,3,2]", "[d0_0,d0_2];[0];[0]"); + INFER_OK(op, "[4,3,2]", "[d0_0,d0_2];[0];[0]"); + INFER_OK(op, "[?,2,3]", "[d0_0,d0_1];[0];[0]"); + INFER_OK(op, "[4,2,3]", 
"[d0_0,d0_1];[0];[0]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]"); + + set_attrs(true, false); + INFER_OK(op, "?", "?;?;?"); + INFER_OK(op, "[?,?,?]", "[d0_0,?];[d0_0,d0_1,?];[d0_0,d0_2,?]"); + INFER_OK(op, "[4,?,?]", "[d0_0,?];[d0_0,d0_1,?];[d0_0,d0_2,?]"); + INFER_OK(op, "[4,2,?]", "[d0_0,?];[d0_0,d0_1,?];[d0_0,d0_2,?]"); + INFER_OK(op, "[4,?,2]", "[d0_0,?];[d0_0,d0_1,?];[d0_0,d0_2,?]"); + INFER_OK(op, "[?,2,2]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_1]"); + INFER_OK(op, "[4,2,2]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_1]"); + INFER_OK(op, "[?,3,2]", "[d0_0,d0_2];[d0_0,d0_1,d0_2];[d0_0,d0_2,d0_2]"); + INFER_OK(op, "[4,3,2]", "[d0_0,d0_2];[d0_0,d0_1,d0_2];[d0_0,d0_2,d0_2]"); + INFER_OK(op, "[?,2,3]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_1]"); + INFER_OK(op, "[4,2,3]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_1]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]"); + + set_attrs(true, true); + INFER_OK(op, "?", "?;?;?"); + INFER_OK(op, "[?,?,?]", "[d0_0,?];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]"); + INFER_OK(op, "[4,?,?]", "[d0_0,?];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]"); + INFER_OK(op, "[4,2,?]", "[d0_0,?];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]"); + INFER_OK(op, "[4,?,2]", "[d0_0,?];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]"); + INFER_OK(op, "[?,2,2]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]"); + INFER_OK(op, "[4,2,2]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]"); + INFER_OK(op, "[?,3,2]", "[d0_0,d0_2];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]"); + INFER_OK(op, "[4,3,2]", "[d0_0,d0_2];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]"); + INFER_OK(op, "[?,2,3]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]"); + INFER_OK(op, "[4,2,3]", "[d0_0,d0_1];[d0_0,d0_1,d0_1];[d0_0,d0_2,d0_2]"); + INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]"); +} + } // end namespace tensorflow diff --git a/tensorflow/core/ops/logging_ops.cc b/tensorflow/core/ops/logging_ops.cc index 5d8d34988df..49fd72a4f0c 100644 --- 
a/tensorflow/core/ops/logging_ops.cc +++ b/tensorflow/core/ops/logging_ops.cc @@ -92,6 +92,7 @@ REGISTER_OP("ScalarSummary") .Input("values: T") .Output("summary: string") .Attr("T: realnumbertype") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Outputs a `Summary` protocol buffer with scalar values. @@ -108,6 +109,7 @@ REGISTER_OP("HistogramSummary") .Input("values: T") .Output("summary: string") .Attr("T: realnumbertype = DT_FLOAT") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Outputs a `Summary` protocol buffer with a histogram. @@ -132,6 +134,7 @@ REGISTER_OP("ImageSummary") "bad_color: tensor = { dtype: DT_UINT8 " "tensor_shape: { dim { size: 4 } } " "int_val: 255 int_val: 0 int_val: 0 int_val: 255 }") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Outputs a `Summary` protocol buffer with images. @@ -183,6 +186,7 @@ REGISTER_OP("AudioSummary") .Output("summary: string") .Attr("sample_rate: float") .Attr("max_outputs: int >= 1 = 3") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Outputs a `Summary` protocol buffer with audio. @@ -209,6 +213,7 @@ REGISTER_OP("MergeSummary") .Input("inputs: N * string") .Output("summary: string") .Attr("N : int >= 1") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Merges summaries. diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index c7a047b03dd..dd41bf2a671 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -499,7 +499,8 @@ REGISTER_OP("Add") .Doc(R"doc( Returns x + y element-wise. -*NOTE*: Add supports broadcasting. AddN does not. +*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); REGISTER_OP("Sub") @@ -507,6 +508,9 @@ REGISTER_OP("Sub") .SetShapeFn(BroadcastBinaryOpShapeFn) .Doc(R"doc( Returns x - y element-wise. + +*NOTE*: `Sub` supports broadcasting. 
More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); REGISTER_OP("Mul") @@ -515,10 +519,16 @@ REGISTER_OP("Mul") .SetShapeFn(BroadcastBinaryOpShapeFn) .Doc(R"doc( Returns x * y element-wise. + +*NOTE*: `Mul` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); REGISTER_OP("Div").BINARY_MORE().SetShapeFn(BroadcastBinaryOpShapeFn).Doc(R"doc( Returns x / y element-wise. + +*NOTE*: `Div` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); REGISTER_OP("SquaredDifference") @@ -527,6 +537,9 @@ REGISTER_OP("SquaredDifference") .SetShapeFn(BroadcastBinaryOpShapeFn) .Doc(R"doc( Returns (x - y)(x - y) element-wise. + +*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); #undef BINARY_FEWER @@ -540,7 +553,10 @@ REGISTER_OP("Maximum") .SetIsCommutative() .SetShapeFn(BroadcastBinaryOpShapeFn) .Doc(R"doc( -Returns the max of x and y (i.e. x > y ? x : y) element-wise, broadcasts. +Returns the max of x and y (i.e. x > y ? x : y) element-wise. + +*NOTE*: `Maximum` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); REGISTER_OP("Minimum") @@ -551,7 +567,10 @@ REGISTER_OP("Minimum") .SetIsCommutative() .SetShapeFn(BroadcastBinaryOpShapeFn) .Doc(R"doc( -Returns the min of x and y (i.e. x < y ? x : y) element-wise, broadcasts. +Returns the min of x and y (i.e. x < y ? x : y) element-wise. + +*NOTE*: `Minimum` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); REGISTER_OP("Mod") @@ -562,6 +581,9 @@ REGISTER_OP("Mod") .SetShapeFn(BroadcastBinaryOpShapeFn) .Doc(R"doc( Returns element-wise remainder of division. + +*NOTE*: `Mod` supports broadcasting. 
More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); REGISTER_OP("Pow") @@ -679,24 +701,36 @@ REGISTER_OP("Less") .COMPARISON() .Doc(R"doc( Returns the truth value of (x < y) element-wise. + +*NOTE*: `Less` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); REGISTER_OP("LessEqual") .COMPARISON() .Doc(R"doc( Returns the truth value of (x <= y) element-wise. + +*NOTE*: `LessEqual` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); REGISTER_OP("Greater") .COMPARISON() .Doc(R"doc( Returns the truth value of (x > y) element-wise. + +*NOTE*: `Greater` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); REGISTER_OP("GreaterEqual") .COMPARISON() .Doc(R"doc( Returns the truth value of (x >= y) element-wise. + +*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); #undef COMPARISON @@ -718,12 +752,18 @@ REGISTER_OP("Equal") .EQUALITY_COMPARISON() .Doc(R"doc( Returns the truth value of (x == y) element-wise. + +*NOTE*: `Equal` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); REGISTER_OP("NotEqual") .EQUALITY_COMPARISON() .Doc(R"doc( Returns the truth value of (x != y) element-wise. + +*NOTE*: `NotEqual` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); #undef EQUALITY_COMPARISON @@ -749,12 +789,18 @@ REGISTER_OP("LogicalAnd") .BINARY_LOGICAL() .Doc(R"doc( Returns the truth value of x AND y element-wise. + +*NOTE*: `LogicalAnd` supports broadcasting. 
More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); REGISTER_OP("LogicalOr") .BINARY_LOGICAL() .Doc(R"doc( Returns the truth value of x OR y element-wise. + +*NOTE*: `LogicalOr` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) )doc"); #undef BINARY_LOGICAL diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 7cce83c66ff..d38a5350831 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -533,6 +533,7 @@ REGISTER_OP("Conv3D") .Attr("T: numbertype") .Attr("strides: list(int) >= 5") .Attr(GetPaddingAttrString()) + .SetShapeFn(shape_inference::Conv3DShape) .Doc(R"doc( Computes a 3-D convolution given 5-D `input` and `filter` tensors. @@ -677,6 +678,7 @@ REGISTER_OP("AvgPool3D") .Attr("strides: list(int) >= 5") .Attr(GetPaddingAttrString()) .Attr("T: numbertype") + .SetShapeFn(shape_inference::Pool3DShape) .Doc(R"doc( Performs 3D average pooling on the input. @@ -726,6 +728,7 @@ REGISTER_OP("MaxPool3D") .Attr("strides: list(int) >= 5") .Attr(GetPaddingAttrString()) .Attr("T: numbertype") + .SetShapeFn(shape_inference::Pool3DShape) .Doc(R"doc( Performs 3D max pooling on the input. @@ -769,6 +772,7 @@ REGISTER_OP("L2Loss") .Input("t: T") .Output("output: T") .Attr("T: numbertype") + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( L2 Loss. @@ -857,6 +861,7 @@ REGISTER_OP("MaxPool") .Attr(GetConvnetDataFormatAttrString()) .Input("input: T") .Output("output: T") + .SetShapeFn(shape_inference::MaxPoolShape) .Doc(R"doc( Performs max pooling on the input. 
@@ -913,6 +918,11 @@ REGISTER_OP("MaxPoolWithArgmax")
     .Output("output: T")
     .Output("argmax: Targmax")
     .Attr("T: {float, half} = DT_FLOAT")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::MaxPoolShape(c));
+      c->set_output(1, c->output(0));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Performs max pooling on the input and outputs both max values and indices.
 
@@ -966,6 +976,77 @@ REGISTER_OP("Dilation2D")
     .Attr("strides: list(int) >= 4")
     .Attr("rates: list(int) >= 4")
     .Attr(GetPaddingAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      const Shape* input_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+      const Shape* filter_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &filter_shape));
+
+      std::vector<int32> strides;
+      TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
+      if (strides.size() != 4) {
+        return errors::InvalidArgument(
+            "Dilation2D requires the stride attribute to contain 4 values, but "
+            "got: ",
+            strides.size());
+      }
+
+      std::vector<int32> rates;
+      TF_RETURN_IF_ERROR(c->GetAttr("rates", &rates));
+      if (rates.size() != 4) {
+        return errors::InvalidArgument(
+            "Dilation2D requires the rates attribute to contain 4 values, but "
+            "got: ",
+            rates.size());
+      }
+
+      int32 stride_rows = strides[1];
+      int32 stride_cols = strides[2];
+
+      int32 rate_rows = rates[1];
+      int32 rate_cols = rates[2];
+
+      const Dimension* batch_size_dim = c->Dim(input_shape, 0);
+      const Dimension* in_rows_dim = c->Dim(input_shape, 1);
+      const Dimension* in_cols_dim = c->Dim(input_shape, 2);
+      const Dimension* filter_rows_dim = c->Dim(filter_shape, 0);
+      const Dimension* filter_cols_dim = c->Dim(filter_shape, 1);
+      const Dimension* output_depth_dim = c->Dim(filter_shape, 2);
+
+      const Dimension* unused;
+      TF_RETURN_IF_ERROR(
+          c->Merge(c->Dim(input_shape, 3), output_depth_dim, &unused));
+
+      // At the moment we need to know the values of several fields.
+ TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_rows_dim, "in_rows")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(in_cols_dim, "in_cols")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_rows_dim, "filter_rows")); + TF_RETURN_IF_ERROR(c->ValidateKnownDim(filter_cols_dim, "filter_cols")); + + auto in_rows = c->Value(in_rows_dim); + auto in_cols = c->Value(in_cols_dim); + auto filter_rows = c->Value(filter_rows_dim); + auto filter_cols = c->Value(filter_cols_dim); + auto filter_rows_eff = filter_rows + (filter_rows - 1) * (rate_rows - 1); + auto filter_cols_eff = filter_cols + (filter_cols - 1) * (rate_cols - 1); + + Padding padding; + TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding)); + + int64 output_rows, output_cols; + int64 padding_before, padding_after; + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_rows, filter_rows_eff, stride_rows, padding, &output_rows, + &padding_before, &padding_after)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_cols, filter_cols_eff, stride_cols, padding, &output_cols, + &padding_before, &padding_after)); + + const Shape* output_shape = c->MakeShape( + {batch_size_dim, output_rows, output_cols, output_depth_dim}); + c->set_output(0, output_shape); + return Status::OK(); + }) .Doc(R"doc( Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors. @@ -1184,7 +1265,7 @@ REGISTER_OP("Softmax") .Output("softmax: T") .Attr("T: {half, float, double}") .SetShapeFn([](InferenceContext* c) { - return shape_inference::UnchangedShapeWithRankAtLeast(c, 2); + return shape_inference::UnchangedShapeWithRank(c, 2); }) .Doc(R"doc( Computes softmax activations. @@ -1204,7 +1285,7 @@ REGISTER_OP("LogSoftmax") .Output("logsoftmax: T") .Attr("T: {half, float, double}") .SetShapeFn([](InferenceContext* c) { - return shape_inference::UnchangedShapeWithRankAtLeast(c, 2); + return shape_inference::UnchangedShapeWithRank(c, 2); }) .Doc(R"doc( Computes log softmax activations. 
diff --git a/tensorflow/core/ops/nn_ops_test.cc b/tensorflow/core/ops/nn_ops_test.cc
index 9056217db3a..773a8f30080 100644
--- a/tensorflow/core/ops/nn_ops_test.cc
+++ b/tensorflow/core/ops/nn_ops_test.cc
@@ -312,4 +312,29 @@ TEST(NNOpsTest, InTopK_ShapeFn) {
   INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;[1,2]");
 }
 
+TEST(NNOpsTest, Dilation2DShapeTest) {
+  ShapeInferenceTestOp op("Dilation2D");
+  auto set_op = [&op](const std::vector<int32>& strides,
+                      const std::vector<int32>& rates, const string& padding) {
+    TF_CHECK_OK(NodeDefBuilder("test", "Dilation2D")
+                    .Input("input", 0, DT_FLOAT)
+                    .Input("filter", 0, DT_FLOAT)
+                    .Attr("strides", strides)
+                    .Attr("rates", rates)
+                    .Attr("padding", padding)
+                    .Finalize(&op.node_def));
+  };
+
+  // rate rows and cols is 1, so filter_rows and cols are unchanged.
+  // We have a 1x1 filter so the output is still 2x2.
+  set_op({1, 1, 1, 1}, {1, 1, 1, 1}, "VALID");
+  INFER_OK(op, "[1,2,2,2];[1,1,2]", "[d0_0,2,2,d1_2]");
+
+  // rate rows and cols is 2, so filter_rows and cols are changed to
+  // be 2 + (2 - 1) = 3. 7x7 input with 3x3 filter and 1x1 stride
+  // gives a 5x5 output.
+  set_op({1, 1, 1, 1}, {1, 2, 2, 1}, "VALID");
+  INFER_OK(op, "[1,7,7,2];[2,2,2]", "[d0_0,5,5,d1_2]");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/no_op.cc b/tensorflow/core/ops/no_op.cc
index ccec5851189..94eaec3fdf8 100644
--- a/tensorflow/core/ops/no_op.cc
+++ b/tensorflow/core/ops/no_op.cc
@@ -13,13 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 
 namespace tensorflow {
 
+using shape_inference::InferenceContext;
+
 REGISTER_OP("NoOp")
-    .Doc(R"doc(
-Does nothing. Only useful as a placeholder for control edges.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc("Does nothing.
Only useful as a placeholder for control edges."); } // namespace tensorflow diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 80805d30f0f..8bc7e5b86c9 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -98,7 +98,7 @@ op { } } summary: "Returns x + y element-wise." - description: "*NOTE*: Add supports broadcasting. AddN does not." + description: "*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" } op { name: "AddN" @@ -1886,7 +1886,7 @@ op { } } } - summary: "Calculates the Cholesky decomposition of a batch of square matrices." + summary: "Computes the Cholesky decomposition of a batch of square matrices." description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices, with the same constraints as the single matrix Cholesky\ndecomposition above. The output is a tensor of the same shape as the input\ncontaining the Cholesky decompositions for all input submatrices `[..., :, :]`." } op { @@ -1898,12 +1898,12 @@ op { } input_arg { name: "grad" - description: "df/dl where f is some scalar function. Shape is `[..., M, M]\'.\nAlgorithm depends only on lower triangular part of the innermost matrices of\nthis tensor." + description: "df/dl where f is some scalar function. Shape is `[..., M, M]`.\nAlgorithm depends only on lower triangular part of the innermost matrices of\nthis tensor." type_attr: "T" } output_arg { name: "output" - description: "Symmetrized version of df/dA . Shape is `[..., M, M]\'" + description: "Symmetrized version of df/dA . Shape is `[..., M, M]`" type_attr: "T" } attr { @@ -1916,7 +1916,7 @@ op { } } } - summary: "Calculates the reverse mode backpropagated gradient of the Cholesky algorithm." + summary: "Computes the reverse mode backpropagated gradient of the Cholesky algorithm." 
description: "For an explanation see \"Differentiation of the Cholesky algorithm\" by\nIain Murray http://arxiv.org/abs/1602.07527." } op { @@ -2110,7 +2110,7 @@ op { } } } - summary: "Calculates the determinants for a batch of square matrices." + summary: "Computes the determinants for a batch of square matrices." description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. The output is a tensor containing the determinants\nfor all input submatrices `[..., :, :]`." } op { @@ -2180,7 +2180,7 @@ op { } } } - summary: "Calculates the inverse of square invertible matrices or their adjoints" + summary: "Computes the inverse of square invertible matrices or their adjoints" description: "(conjugate transposes).\n\nThe input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. The output is a tensor of the same shape as the input\ncontaining the inverse for all input submatrices `[..., :, :]`.\n\nThe op uses LU decomposition with partial pivoting to compute the inverses.\n\nIf a matrix is not invertible there is no guarantee what the op does. It\nmay detect the condition and raise an exception or it may simply return a\ngarbage result." } op { @@ -2284,7 +2284,7 @@ op { } } summary: "Solves multiple linear least-squares problems." - description: "`matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions\nform square matrices. Rhs is a tensor of shape `[..., M, K]`. 
The output\nis a tensor shape `[..., N, K]` where each output matrix solves each of\nthe equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :] in the\nleast squares sense.\n\nBelow we will use the following notation for each pair of\nmatrix and right-hand sides in the batch:\n\n`matrix`=\\\\(A \\in \\Re^{m \\times n}\\\\),\n`rhs`=\\\\(B \\in \\Re^{m \\times k}\\\\),\n`output`=\\\\(X \\in \\Re^{n \\times k}\\\\),\n`l2_regularizer`=\\\\(\\lambda\\\\).\n\nIf `fast` is `True`, then the solution is computed by solving the normal\nequations using Cholesky decomposition. Specifically, if \\\\(m \\ge n\\\\) then\n\\\\(X = (A^T A + \\lambda I)^{-1} A^T B\\\\), which solves the least-squares\nproblem \\\\(X = \\mathrm{argmin}_{Z \\in \\Re^{n \\times k}} ||A Z - B||_F^2 +\n\\lambda ||Z||_F^2\\\\). If \\\\(m \\lt n\\\\) then `output` is computed as\n\\\\(X = A^T (A A^T + \\lambda I)^{-1} B\\\\), which (for \\\\(\\lambda = 0\\\\)) is the\nminimum-norm solution to the under-determined linear system, i.e.\n\\\\(X = \\mathrm{argmin}_{Z \\in \\Re^{n \\times k}} ||Z||_F^2 \\\\), subject to\n\\\\(A Z = B\\\\). Notice that the fast path is only numerically stable when\n\\\\(A\\\\) is numerically full rank and has a condition number\n\\\\(\\mathrm{cond}(A) \\lt \\frac{1}{\\sqrt{\\epsilon_{mach}}}\\\\) or\\\\(\\lambda\\\\) is\nsufficiently large.\n\nIf `fast` is `False` an algorithm based on the numerically robust complete\northogonal decomposition is used. This computes the minimum-norm\nleast-squares solution, even when \\\\(A\\\\) is rank deficient. This path is\ntypically 6-7 times slower than the fast path. If `fast` is `False` then\n`l2_regularizer` is ignored." + description: "`matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions\nform matrices of size `[M, N]`. 
Rhs is a tensor of shape `[..., M, K]`.\nThe output is a tensor shape `[..., N, K]` where each output matrix solves\neach of the equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]\nin the least squares sense.\n\nBelow we will use the following notation for each pair of\nmatrix and right-hand sides in the batch:\n\n`matrix`=\\\\(A \\in \\Re^{m \\times n}\\\\),\n`rhs`=\\\\(B \\in \\Re^{m \\times k}\\\\),\n`output`=\\\\(X \\in \\Re^{n \\times k}\\\\),\n`l2_regularizer`=\\\\(\\lambda\\\\).\n\nIf `fast` is `True`, then the solution is computed by solving the normal\nequations using Cholesky decomposition. Specifically, if \\\\(m \\ge n\\\\) then\n\\\\(X = (A^T A + \\lambda I)^{-1} A^T B\\\\), which solves the least-squares\nproblem \\\\(X = \\mathrm{argmin}_{Z \\in \\Re^{n \\times k}} ||A Z - B||_F^2 +\n\\lambda ||Z||_F^2\\\\). If \\\\(m \\lt n\\\\) then `output` is computed as\n\\\\(X = A^T (A A^T + \\lambda I)^{-1} B\\\\), which (for \\\\(\\lambda = 0\\\\)) is the\nminimum-norm solution to the under-determined linear system, i.e.\n\\\\(X = \\mathrm{argmin}_{Z \\in \\Re^{n \\times k}} ||Z||_F^2 \\\\), subject to\n\\\\(A Z = B\\\\). Notice that the fast path is only numerically stable when\n\\\\(A\\\\) is numerically full rank and has a condition number\n\\\\(\\mathrm{cond}(A) \\lt \\frac{1}{\\sqrt{\\epsilon_{mach}}}\\\\) or\\\\(\\lambda\\\\) is\nsufficiently large.\n\nIf `fast` is `False` an algorithm based on the numerically robust complete\northogonal decomposition is used. This computes the minimum-norm\nleast-squares solution, even when \\\\(A\\\\) is rank deficient. This path is\ntypically 6-7 times slower than the fast path. If `fast` is `False` then\n`l2_regularizer` is ignored." } op { name: "BatchMatrixTriangularSolve" @@ -2515,8 +2515,101 @@ op { } } } - summary: "Calculates the Eigen Decomposition of a batch of square self-adjoint matrices." 
- description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices, with the same constraints as the single matrix\nSelfAdjointEig.\n\nThe result is a \'[..., M+1, M] matrix with [..., 0,:] containing the\neigenvalues, and subsequent [...,1:, :] containing the eigenvectors." + summary: "Computes the Eigen Decomposition of a batch of square self-adjoint matrices." + description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices, with the same constraints as the single matrix\nSelfAdjointEig.\n\nThe result is a [..., M+1, M] matrix with [..., 0,:] containing the\neigenvalues, and subsequent [...,1:, :] containing the eigenvectors." + deprecation { + version: 11 + explanation: "Use BatchSelfAdjointEigV2 instead." + } +} +op { + name: "BatchSelfAdjointEigV2" + input_arg { + name: "input" + description: "`Tensor` input of shape `[N, N]`." + type_attr: "T" + } + output_arg { + name: "e" + description: "Eigenvalues. Shape is `[N]`." + type_attr: "T" + } + output_arg { + name: "v" + description: "Eigenvectors. Shape is `[N, N]`." + type_attr: "T" + } + attr { + name: "compute_v" + type: "bool" + default_value { + b: true + } + description: "If `True` then eigenvectors will be computed and returned in `v`.\nOtherwise, only the eigenvalues will be computed." + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + } + } + } + summary: "Computes the eigen decomposition of a batch of square self-adjoint matrices." 
+ description: "Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in\n`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.\n\n```prettyprint\n# a is a tensor.\n# e is a tensor of eigenvalues.\n# v is a tensor of eigenvectors.\ne, v = batch_self_adjoint_eig(a)\ne = batch_self_adjoint_eig(a, compute_v=False)\n```" +} +op { + name: "BatchSvd" + input_arg { + name: "input" + description: "A tensor of shape `[..., M, N]` whose inner-most 2 dimensions\nform matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`." + type_attr: "T" + } + output_arg { + name: "s" + description: "Singular values. Shape is `[..., P]`." + type_attr: "T" + } + output_arg { + name: "u" + description: "Left singular vectors. If `full_matrices` is `False` then shape is\n`[..., M, M]`; if `full_matrices` is `True` then shape is\n`[..., M, P]`. Undefined if `compute_uv` is `False`." + type_attr: "T" + } + output_arg { + name: "v" + description: "Left singular vectors. If `full_matrices` is `False` then shape is\n`[..., N, N]`. If `full_matrices` is `True` then shape is `[..., N, P]`.\nUndefined if `compute_uv` is false." + type_attr: "T" + } + attr { + name: "compute_uv" + type: "bool" + default_value { + b: true + } + description: "If true, left and right singular vectors will be\ncomputed and returned in `u` and `v`, respectively.\nIf false, `u` and `v` are not set and should never referenced." + } + attr { + name: "full_matrices" + type: "bool" + default_value { + b: false + } + description: "If true, compute full-sized `u` and `v`. If false\n(the default), compute only the leading `P` singular vectors.\nIgnored if `compute_uv` is `False`." + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + } + } + } + summary: "Computes the singular value decompositions of a batch of matrices." 
+ description: "Computes the SVD of each inner matrix in `input` such that\n`input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`\n\n```prettyprint\n# a is a tensor containing a batch of matrices.\n# s is a tensor of singular values for each matrix.\n# u is the tensor containing of left singular vectors for each matrix.\n# v is the tensor containing of right singular vectors for each matrix.\ns, u, v = batch_svd(a)\ns, _, _ = batch_svd(a, compute_uv=False)\n```" } op { name: "BatchToSpace" @@ -3023,7 +3116,7 @@ op { } } } - summary: "Calculates the Cholesky decomposition of a square matrix." + summary: "Computes the Cholesky decomposition of a square matrix." description: "The input has to be symmetric and positive definite. Only the lower-triangular\npart of the input will be used for this operation. The upper-triangular part\nwill not be read.\n\nThe result is the lower-triangular matrix of the Cholesky decomposition of the\ninput, `L`, so that `input = L L^*`." } op { @@ -3035,12 +3128,12 @@ op { } input_arg { name: "grad" - description: "df/dl where f is some scalar function. Shape is `[M, M]\'.\nAlgorithm depends only on lower triangular part of this matrix." + description: "df/dl where f is some scalar function. Shape is `[M, M]`.\nAlgorithm depends only on lower triangular part of this matrix." type_attr: "T" } output_arg { name: "output" - description: "Symmetrized version of df/dA . Shape is `[M, M]\'." + description: "Symmetrized version of df/dA . Shape is `[M, M]`." type_attr: "T" } attr { @@ -3053,7 +3146,7 @@ op { } } } - summary: "Calculates the reverse mode backpropagated gradient of the Cholesky algorithm." + summary: "Computes the reverse mode backpropagated gradient of the Cholesky algorithm." description: "For an explanation see \"Differentiation of the Cholesky algorithm\" by\nIain Murray http://arxiv.org/abs/1602.07527." } op { @@ -3292,8 +3385,8 @@ op { } op { name: "ControlTrigger" - summary: "Does nothing. 
Serves as a control trigger for scheduling. Only useful as a" - description: "placeholder for control edges." + summary: "Does nothing. Serves as a control trigger for scheduling." + description: "Only useful as a placeholder for control edges." } op { name: "Conv2D" @@ -4278,6 +4371,15 @@ op { } description: "Name of the input tensor." } + attr { + name: "debug_urls" + type: "list(string)" + default_value { + list { + } + } + description: "List of URLs to debug targets, e.g.,\nfile:///foo/tfdbg_dump, grpc:://localhost:11011" + } summary: "Debug Identity Op." description: "Provides an identity mapping of the non-Ref type input tensor for debugging." } @@ -4305,6 +4407,15 @@ op { } description: "Name of the input tensor." } + attr { + name: "debug_urls" + type: "list(string)" + default_value { + list { + } + } + description: "List of URLs to debug targets, e.g.,\nfile:///foo/tfdbg_dump, grpc:://localhost:11011" + } summary: "Debug NaN Value Counter Op" description: "Counts number of NaNs in the input tensor, for debugging." } @@ -5077,6 +5188,7 @@ op { } } summary: "Returns x / y element-wise." + description: "*NOTE*: `Div` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" } op { name: "DrawBoundingBoxes" @@ -5488,6 +5600,7 @@ op { } } summary: "Returns the truth value of (x == y) element-wise." + description: "*NOTE*: `Equal` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" is_commutative: true } op { @@ -6168,6 +6281,7 @@ op { } } summary: "Returns the truth value of (x > y) element-wise." + description: "*NOTE*: `Greater` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" } op { name: "GreaterEqual" @@ -6201,6 +6315,7 @@ op { } } summary: "Returns the truth value of (x >= y) element-wise." + description: "*NOTE*: `GreaterEqual` supports broadcasting. 
More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" } op { name: "HSVToRGB" @@ -7086,6 +7201,7 @@ op { } } summary: "Returns the truth value of (x < y) element-wise." + description: "*NOTE*: `Less` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" } op { name: "LessEqual" @@ -7119,6 +7235,7 @@ op { } } summary: "Returns the truth value of (x <= y) element-wise." + description: "*NOTE*: `LessEqual` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" } op { name: "Lgamma" @@ -7341,6 +7458,7 @@ op { type: DT_BOOL } summary: "Returns the truth value of x AND y element-wise." + description: "*NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" is_commutative: true } op { @@ -7370,6 +7488,7 @@ op { type: DT_BOOL } summary: "Returns the truth value of x OR y element-wise." + description: "*NOTE*: `LogicalOr` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" is_commutative: true } op { @@ -7576,7 +7695,7 @@ op { } } } - summary: "Calculates the determinant of a square matrix." + summary: "Computes the determinant of a square matrix." } op { name: "MatrixInverse" @@ -7607,7 +7726,7 @@ op { } } } - summary: "Calculates the inverse of a square invertible matrix or its adjoint (conjugate" + summary: "Computes the inverse of a square invertible matrix or its adjoint (conjugate" description: "transpose).\n\nThe op uses LU decomposition with partial pivoting to compute the inverse.\n\nIf the matrix is not invertible there is no guarantee what the op does. It\nmay detect the condition and raise an exception or it may simply return a\ngarbage result." } op { @@ -8227,7 +8346,8 @@ op { } } } - summary: "Returns the max of x and y (i.e. x > y ? 
x : y) element-wise, broadcasts." + summary: "Returns the max of x and y (i.e. x > y ? x : y) element-wise." + description: "*NOTE*: `Maximum` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" is_commutative: true } op { @@ -8410,7 +8530,8 @@ op { } } } - summary: "Returns the min of x and y (i.e. x < y ? x : y) element-wise, broadcasts." + summary: "Returns the min of x and y (i.e. x < y ? x : y) element-wise." + description: "*NOTE*: `Minimum` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" is_commutative: true } op { @@ -8510,6 +8631,7 @@ op { } } summary: "Returns element-wise remainder of division." + description: "*NOTE*: `Mod` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" } op { name: "Mul" @@ -8544,6 +8666,7 @@ op { } } summary: "Returns x * y element-wise." + description: "*NOTE*: `Mul` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" is_commutative: true } op { @@ -8845,6 +8968,7 @@ op { } } summary: "Returns the truth value of (x != y) element-wise." + description: "*NOTE*: `NotEqual` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" is_commutative: true } op { @@ -12095,8 +12219,50 @@ op { } } } - summary: "Calculates the Eigen Decomposition of a square Self-Adjoint matrix." + summary: "Computes the Eigen Decomposition of a square Self-Adjoint matrix." description: "Only the lower-triangular part of the input will be used in this case. The\nupper-triangular part will not be read.\n\nThe result is a M+1 x M matrix whose first row is the eigenvalues, and\nsubsequent rows are eigenvectors." + deprecation { + version: 11 + explanation: "Use SelfAdjointEigV2 instead." 
+ } +} +op { + name: "SelfAdjointEigV2" + input_arg { + name: "input" + description: "`Tensor` input of shape `[N, N]`." + type_attr: "T" + } + output_arg { + name: "e" + description: "Eigenvalues. Shape is `[N]`." + type_attr: "T" + } + output_arg { + name: "v" + description: "Eigenvectors. Shape is `[N, N]`." + type_attr: "T" + } + attr { + name: "compute_v" + type: "bool" + default_value { + b: true + } + description: "If `True` then eigenvectors will be computed and returned in `v`.\nOtherwise, only the eigenvalues will be computed." + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + } + } + } + summary: "Computes the eigen decomposition of a self-adjoint (\\\"symmetric\\\") matrix." + description: "Computes the eigenvalues and (optionally) eigenvectors such that\n`input = v * diag(e)`.\n\n```prettyprint\n# a is a self-adjoint matrix.\n# e is a vector of eigenvalues.\n# v is a matrix of eigenvectors.\ne, v = self_adjoint_eig(a)\ne = self_adjoint_eig(a, compute_v=False)\n```" } op { name: "SerializeManySparse" @@ -14625,6 +14791,7 @@ op { } } summary: "Returns (x - y)(x - y) element-wise." + description: "*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" is_commutative: true } op { @@ -15076,6 +15243,7 @@ op { } } summary: "Returns x - y element-wise." + description: "*NOTE*: `Sub` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)" } op { name: "Sum" @@ -15127,6 +15295,57 @@ op { summary: "Computes the sum of elements across dimensions of a tensor." description: "Reduces `input` along the dimensions given in `reduction_indices`. Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_indices`. If `keep_dims` is true, the reduced dimensions are\nretained with length 1." 
} +op { + name: "Svd" + input_arg { + name: "input" + description: "Shape is `[M, N]`. Let `P` be the minimum of `M` and `N`." + type_attr: "T" + } + output_arg { + name: "s" + description: "Singular values. Shape is `[P]`." + type_attr: "T" + } + output_arg { + name: "u" + description: "Left singular vectors; if `full_matrices` is `False` then shape is `[M, M]`.\nIf `full_matrices` is `True` then shape is `[M, P]`.\nUndefined if `compute_uv` is `False`." + type_attr: "T" + } + output_arg { + name: "v" + description: "Left singular vectors. If `full_matrices` is `False` then shape is `[N, N]`.\nIf `full_matrices` is `True` then shape is `[N, P]`.\nUndefined if `compute_uv` is false." + type_attr: "T" + } + attr { + name: "compute_uv" + type: "bool" + default_value { + b: true + } + description: "If true, left and right singular vectors will be\ncomputed and returned in `u` and `v`, respectively.\nIf false, `u` and `v` are not set and should never referenced." + } + attr { + name: "full_matrices" + type: "bool" + default_value { + b: false + } + description: "If true, compute full-sized `u` and `v`. If false\n(the default), compute only the leading `P` singular vectors.\nIgnored if `compute_uv` is `False`." + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_DOUBLE + type: DT_FLOAT + } + } + } + summary: "Computes the singular value decomposition of a matrix." 
+ description: "Computes the SVD of if `input` such that `input = u * diag(s) * transpose(v)`\n\n```prettyprint\n# a is a matrix.\n# s is a vector of singular values.\n# u is the matrix of left singular vectors.\n# v is a matrix of right singular vectors.\ns, u, v = svd(a)\ns, _, _ = svd(a, compute_uv=False)\n```" +} op { name: "Switch" input_arg { diff --git a/tensorflow/core/ops/random_ops.cc b/tensorflow/core/ops/random_ops.cc index 6e1cdef05fa..5d648a6a7eb 100644 --- a/tensorflow/core/ops/random_ops.cc +++ b/tensorflow/core/ops/random_ops.cc @@ -15,9 +15,25 @@ limitations under the License. #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" namespace tensorflow { +using shape_inference::Dimension; +using shape_inference::InferenceContext; +using shape_inference::Shape; + +namespace { + +Status RandomShape(InferenceContext* c) { + const Shape* out; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out)); + c->set_output(0, out); + return Status::OK(); +} + +} // namepsace + REGISTER_OP("RandomUniform") .Input("shape: T") .SetIsStateful() @@ -26,6 +42,7 @@ REGISTER_OP("RandomUniform") .Attr("seed2: int = 0") .Attr("dtype: {half,float,double}") .Attr("T: {int32, int64}") + .SetShapeFn(RandomShape) .Doc(R"doc( Outputs random values from a uniform distribution. @@ -52,6 +69,7 @@ REGISTER_OP("RandomUniformInt") .Attr("seed2: int = 0") .Attr("Tout: {int32, int64}") .Attr("T: {int32, int64}") + .SetShapeFn(RandomShape) .Doc(R"doc( Outputs random integers from a uniform distribution. @@ -82,6 +100,7 @@ REGISTER_OP("RandomStandardNormal") .Attr("seed2: int = 0") .Attr("dtype: {half,float,double}") .Attr("T: {int32, int64}") + .SetShapeFn(RandomShape) .Doc(R"doc( Outputs random values from a normal distribution. 
@@ -109,6 +128,7 @@ REGISTER_OP("ParameterizedTruncatedNormal") .Attr("seed2: int = 0") .Attr("dtype: {half,float,double}") .Attr("T: {int32, int64}") + .SetShapeFn(RandomShape) .Doc(R"doc( Outputs random values from a normal distribution. The parameters may each be a scalar which applies to the entire output, or a vector of length shape[0] which @@ -138,6 +158,7 @@ REGISTER_OP("TruncatedNormal") .Attr("seed2: int = 0") .Attr("dtype: {half,float,double}") .Attr("T: {int32, int64}") + .SetShapeFn(RandomShape) .Doc(R"doc( Outputs random values from a truncated normal distribution. @@ -195,6 +216,16 @@ REGISTER_OP("Multinomial") .Attr("seed: int = 0") .Attr("seed2: int = 0") .Attr("T: realnumbertype") + .SetShapeFn([](InferenceContext* c) { + const Shape* logits_shape; + const Shape* unused; + const Dimension* num_samples; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &logits_shape)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(1, &num_samples)); + c->set_output(0, c->Matrix(c->Dim(logits_shape, 0), num_samples)); + return Status::OK(); + }) .Doc(R"doc( Draws samples from a multinomial distribution. @@ -217,6 +248,13 @@ REGISTER_OP("RandomGamma") .Attr("seed2: int = 0") .Attr("S: {int32, int64}") .Attr("T: {half, float, double}") + .SetShapeFn([](InferenceContext* c) { + const Shape* out; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out)); + TF_RETURN_IF_ERROR(c->Concatenate(out, c->input(1), &out)); + c->set_output(0, out); + return Status::OK(); + }) .Doc(R"doc( Outputs random values from the Gamma distribution(s) described by alpha. diff --git a/tensorflow/core/ops/random_ops_test.cc b/tensorflow/core/ops/random_ops_test.cc new file mode 100644 index 00000000000..524e1079981 --- /dev/null +++ b/tensorflow/core/ops/random_ops_test.cc @@ -0,0 +1,56 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); + +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference_testutil.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +TEST(RandomOpsTest, Multinomial_ShapeFn) { + ShapeInferenceTestOp op("Multinomial"); + op.input_tensors.resize(2); + + INFER_OK(op, "?;?", "[?,?]"); + INFER_ERROR("Shape must be rank 2 but is rank 1", op, "[?];?"); + INFER_OK(op, "[?,?];?", "[d0_0,?]"); + INFER_OK(op, "[2,?];?", "[d0_0,?]"); + INFER_OK(op, "[2,1];?", "[d0_0,?]"); + Tensor num_samples = test::AsScalar(3); + op.input_tensors[1] = &num_samples; + INFER_OK(op, "[2,1];[]", "[d0_0,3]"); + num_samples = test::AsTensor({1, 2, 3}); + INFER_ERROR("Shape must be rank 0 but is rank 1", op, "[2,1];[3]"); +} + +TEST(RandomOpsTest, RandomGamma_ShapeFn) { + ShapeInferenceTestOp op("RandomGamma"); + op.input_tensors.resize(2); + + INFER_OK(op, "?;?", "?"); + INFER_OK(op, "?;[3]", "?"); + INFER_OK(op, "[1];?", "?"); + INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[1,2];[3,4]"); + Tensor shape = test::AsTensor({1, 2, 3}); + op.input_tensors[0] = &shape; + INFER_OK(op, "[3];[4,?]", "[1,2,3,d1_0,d1_1]"); + INFER_OK(op, "[3];[4,5]", "[1,2,3,d1_0,d1_1]"); + INFER_OK(op, "[3];[]", "[1,2,3]"); +} + +} // end namespace tensorflow diff 
--git a/tensorflow/core/ops/script_ops.cc b/tensorflow/core/ops/script_ops.cc index e049b9af4ab..40144c62b11 100644 --- a/tensorflow/core/ops/script_ops.cc +++ b/tensorflow/core/ops/script_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" namespace tensorflow { @@ -24,6 +25,7 @@ REGISTER_OP("PyFunc") .Attr("Tin: list(type) >= 0") .Attr("Tout: list(type)") .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape) .Doc(R"doc( Invokes a python function to compute func(input)->output. diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc index 1ad9f7175fc..17d5983d76f 100644 --- a/tensorflow/core/ops/sparse_ops.cc +++ b/tensorflow/core/ops/sparse_ops.cc @@ -363,7 +363,7 @@ REGISTER_OP("SparseConcat") .Attr("T: type") .SetShapeFn([](InferenceContext* c) { // These accumulates the sum. - const Dimension* output_row_count = c->MakeDim(0); + const Dimension* output_row_count = c->MakeDim(0ll); // These are only merged. const Dimension* output_ind_cols = c->UnknownDim(); @@ -662,13 +662,19 @@ keep_dims: If true, retain reduced dimensions with length 1. output: `R-K`-D. The reduced Tensor. 
)doc"); -#define SPARSE_DENSE_CWISE_SIGNATURE() \ - Input("sp_indices: int64") \ - .Input("sp_values: T") \ - .Input("sp_shape: int64") \ - .Input("dense: T") \ - .Output("output: T") \ - .Attr("T: numbertype") +#define SPARSE_DENSE_CWISE_SIGNATURE() \ + Input("sp_indices: int64") \ + .Input("sp_values: T") \ + .Input("sp_shape: int64") \ + .Input("dense: T") \ + .Output("output: T") \ + .Attr("T: numbertype") \ + .SetShapeFn([](InferenceContext* c) { \ + const Shape* input; \ + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &input)); \ + c->set_output(0, c->Vector(c->Dim(input, 0))); \ + return Status::OK(); \ + }) REGISTER_OP("SparseDenseCwiseMul").SPARSE_DENSE_CWISE_SIGNATURE().Doc(R"doc( Component-wise multiplies a SparseTensor by a dense Tensor. @@ -722,6 +728,8 @@ dense: `R`-D. The dense Tensor operand. output: 1-D. The `N` values that are operated on. )doc"); +#undef SPARSE_DENSE_CWISE_SIGNATURE + REGISTER_OP("SparseSoftmax") .Input("sp_indices: int64") .Input("sp_values: T") diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc index a80c3a18768..684e86a00dc 100644 --- a/tensorflow/core/ops/state_ops.cc +++ b/tensorflow/core/ops/state_ops.cc @@ -13,10 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" namespace tensorflow { +using shape_inference::InferenceContext; +using shape_inference::Shape; + REGISTER_OP("Variable") .Output("ref: Ref(dtype)") .Attr("shape: shape") @@ -24,6 +28,7 @@ REGISTER_OP("Variable") .Attr("container: string = ''") .Attr("shared_name: string = ''") .SetIsStateful() + .SetShapeFn(shape_inference::UnknownShape) .Doc(R"doc( Holds state in the form of a tensor that persists across steps. 
@@ -41,10 +46,11 @@ shared_name: If non-empty, this variable is named in the given bucket )doc"); REGISTER_OP("IsVariableInitialized") - .Output("is_initialized: bool") .Input("ref: Ref(dtype)") + .Output("is_initialized: bool") .Attr("dtype: type") .SetAllowsUninitializedInput() + .SetShapeFn(shape_inference::ScalarShape) .Doc(R"doc( Checks whether a tensor has been initialized. @@ -60,6 +66,14 @@ REGISTER_OP("TemporaryVariable") .Attr("dtype: type") .Attr("var_name: string = ''") .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { + TensorShapeProto shape_proto; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape_proto)); + const Shape* output; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeProto(shape_proto, &output)); + c->set_output(0, output); + return Status::OK(); + }) .Doc(R"doc( Returns a tensor that may be mutated, but only persists within a single step. @@ -90,6 +104,7 @@ REGISTER_OP("DestroyTemporaryVariable") .Output("value: T") .Attr("T: type") .Attr("var_name: string") + .SetShapeFn(shape_inference::UnchangedShape) .Doc(R"doc( Destroys the temporary variable and returns its final value. @@ -114,6 +129,16 @@ REGISTER_OP("Assign") .Attr("validate_shape: bool = true") .Attr("use_locking: bool = true") .SetAllowsUninitializedInput() + .SetShapeFn([](InferenceContext* c) { + bool validate_shape; + TF_RETURN_IF_ERROR(c->GetAttr("validate_shape", &validate_shape)); + if (validate_shape) { + return shape_inference::MergeBothInputsShapeFn(c); + } + + c->set_output(0, c->input(1)); + return Status::OK(); + }) .Doc(R"doc( Update 'ref' by assigning 'value' to it. @@ -137,6 +162,7 @@ REGISTER_OP("AssignAdd") .Output("output_ref: Ref(T)") .Attr("T: numbertype") .Attr("use_locking: bool = false") + .SetShapeFn(shape_inference::MergeBothInputsShapeFn) .Doc(R"doc( Update 'ref' by adding 'value' to it. 
@@ -157,6 +183,7 @@ REGISTER_OP("AssignSub") .Output("output_ref: Ref(T)") .Attr("T: numbertype") .Attr("use_locking: bool = false") + .SetShapeFn(shape_inference::MergeBothInputsShapeFn) .Doc(R"doc( Update 'ref' by subtracting 'value' from it. @@ -171,6 +198,25 @@ output_ref:= Same as "ref". Returned as a convenience for operations that want to use the new value after the variable has been updated. )doc"); +namespace { + +Status ScatterUpdateShape(InferenceContext* c) { + const Shape* var_shape = c->input(0); + const Shape* indices_shape = c->input(1); + + const Shape* unused_updates_shape; + const Shape* concat; + const Shape* var_subshape; + TF_RETURN_IF_ERROR(c->Subshape(var_shape, 1, &var_subshape)); + TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, var_subshape, &concat)); + TF_RETURN_IF_ERROR(c->Merge(c->input(2), concat, &unused_updates_shape)); + + c->set_output(0, var_shape); + return Status::OK(); +} + +} // namespace + REGISTER_OP("ScatterUpdate") .Input("ref: Ref(T)") .Input("indices: Tindices") @@ -179,6 +225,7 @@ REGISTER_OP("ScatterUpdate") .Attr("T: type") .Attr("Tindices: {int32, int64}") .Attr("use_locking: bool = true") + .SetShapeFn(ScatterUpdateShape) .Doc(R"doc( Applies sparse updates to a variable reference. @@ -223,6 +270,7 @@ REGISTER_OP("ScatterAdd") .Attr("T: numbertype") .Attr("Tindices: {int32, int64}") .Attr("use_locking: bool = false") + .SetShapeFn(ScatterUpdateShape) .Doc(R"doc( Adds sparse updates to a variable reference. @@ -266,6 +314,7 @@ REGISTER_OP("ScatterSub") .Attr("T: numbertype") .Attr("Tindices: {int32, int64}") .Attr("use_locking: bool = false") + .SetShapeFn(ScatterUpdateShape) .Doc(R"doc( Subtracts sparse updates to a variable reference. 
@@ -304,6 +353,12 @@ REGISTER_OP("CountUpTo") .Output("output: T") .Attr("limit: int") .Attr("T: {int32, int64}") + .SetShapeFn([](InferenceContext* c) { + const Shape* output; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &output)); + c->set_output(0, output); + return Status::OK(); + }) .Doc(R"doc( Increments 'ref' until it reaches 'limit'. diff --git a/tensorflow/core/ops/state_ops_test.cc b/tensorflow/core/ops/state_ops_test.cc new file mode 100644 index 00000000000..586de77edc8 --- /dev/null +++ b/tensorflow/core/ops/state_ops_test.cc @@ -0,0 +1,74 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); + +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +TEST(StateOpsTest, Assign_ShapeFn) { + ShapeInferenceTestOp op("Assign"); + + TF_ASSERT_OK(NodeDefBuilder("test", "Assign") + .Input("ref", 0, DT_FLOAT_REF) + .Input("value", 1, DT_FLOAT) + .Attr("validate_shape", true) + .Finalize(&op.node_def)); + INFER_OK(op, "[1,2];[1,2]", "in0"); + + // Resolves shapes when validate_shape is True. + INFER_OK(op, "[1,?];[?,2]", "[d0_0,d1_1]"); + + // validate_shape=True, fails when the shapes are not compatible. 
+ INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 3", op, + "[1,?];[3,2]"); + + // Test for validate_shape=False + TF_ASSERT_OK(NodeDefBuilder("test", "Assign") + .Input("ref", 0, DT_FLOAT_REF) + .Input("value", 1, DT_FLOAT) + .Attr("validate_shape", false) + .Finalize(&op.node_def)); + INFER_OK(op, "[1,2];[1,2,3,4]", "in1"); +} + +TEST(StateOpsTest, ScatterUpdate_ShapeFn) { + ShapeInferenceTestOp op("ScatterUpdate"); + TF_ASSERT_OK(NodeDefBuilder("test", "ScatterUpdate") + .Input("ref", 0, DT_FLOAT_REF) + .Input("indices", 0, DT_INT32) + .Input("updates", 1, DT_FLOAT) + .Finalize(&op.node_def)); + INFER_OK(op, "[1,2];[3];[3,2]", "in0"); + + // Resolve shape on first updates dimension. + INFER_OK(op, "[1,2];[3];[?,2]", "in0"); +} + +TEST(StateOpsTest, TemporaryVariable_ShapeFn) { + ShapeInferenceTestOp op("TemporaryVariable"); + TensorShape shape({1, 2, 3}); + TensorShapeProto shape_proto; + shape.AsProto(&shape_proto); + TF_ASSERT_OK(NodeDefBuilder("test", "TemporaryVariable") + .Attr("shape", shape_proto) + .Finalize(&op.node_def)); + INFER_OK(op, "", "[1,2,3]"); +} + +} // end namespace tensorflow diff --git a/tensorflow/core/platform/context.h b/tensorflow/core/platform/context.h index e6555029fd8..728ef916312 100644 --- a/tensorflow/core/platform/context.h +++ b/tensorflow/core/platform/context.h @@ -18,6 +18,13 @@ limitations under the License. namespace tensorflow { +enum class ContextKind { + // Initial state with default (empty) values. + kDefault, + // Initial state inherited from the creating or scheduling thread. + kThread, +}; + // Context is a container for request-specific information that should be passed // to threads that perform related work. The default constructor should capture // all relevant context. 
diff --git a/tensorflow/core/platform/default/context.h b/tensorflow/core/platform/default/context.h index 5d261ea9fbf..d8afeb47a9c 100644 --- a/tensorflow/core/platform/default/context.h +++ b/tensorflow/core/platform/default/context.h @@ -19,6 +19,9 @@ limitations under the License. namespace tensorflow { class Context { + public: + Context() {} + Context(const ContextKind kind) {} }; class WithContext { diff --git a/tensorflow/core/platform/default/dynamic_annotations.h b/tensorflow/core/platform/default/dynamic_annotations.h index c86603117e7..d087035b5ab 100644 --- a/tensorflow/core/platform/default/dynamic_annotations.h +++ b/tensorflow/core/platform/default/dynamic_annotations.h @@ -19,9 +19,14 @@ limitations under the License. // IWYU pragma: private, include "third_party/tensorflow/core/platform/mem.h" // IWYU pragma: friend third_party/tensorflow/core/platform/mem.h -// Do nothing for this platform +// Do nothing for this platform. + #define TF_ANNOTATE_MEMORY_IS_INITIALIZED(ptr, bytes) \ do { \ } while (0) +#define TF_ANNOTATE_BENIGN_RACE(ptr, description) \ + do { \ + } while (0) + #endif // TENSORFLOW_CORE_PLATFORM_DEFAULT_DYNAMIC_ANNOTATIONS_H_ diff --git a/tensorflow/core/platform/default/protobuf.h b/tensorflow/core/platform/default/protobuf.h index 544f62d1f8e..acc804a4eee 100644 --- a/tensorflow/core/platform/default/protobuf.h +++ b/tensorflow/core/platform/default/protobuf.h @@ -24,6 +24,7 @@ limitations under the License. 
#include "google/protobuf/descriptor.h" #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" +#include "google/protobuf/io/zero_copy_stream_impl_lite.h" #include "google/protobuf/map.h" #include "google/protobuf/repeated_field.h" #include "google/protobuf/text_format.h" diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc index 3da8d3f1245..7cbcc40ccb8 100644 --- a/tensorflow/core/platform/env_test.cc +++ b/tensorflow/core/platform/env_test.cc @@ -188,9 +188,14 @@ TEST(EnvTest, GetSchemeForURI) { TEST(EnvTest, SleepForMicroseconds) { Env* env = Env::Default(); const int64 start = env->NowMicros(); - env->SleepForMicroseconds(1e6 + 5e5); + const int64 sleep_time = 1e6 + 5e5; + env->SleepForMicroseconds(sleep_time); const int64 delta = env->NowMicros() - start; - EXPECT_GE(delta, 1e6 + 5e5); + + // Subtract 10 from the sleep_time for this check because NowMicros can + // sometimes give slightly inconsistent values between the start and the + // finish (e.g. because the two calls run on different CPUs). + EXPECT_GE(delta, sleep_time - 10); } } // namespace tensorflow diff --git a/tensorflow/core/platform/file_statistics.h b/tensorflow/core/platform/file_statistics.h new file mode 100644 index 00000000000..cc781e0a7f2 --- /dev/null +++ b/tensorflow/core/platform/file_statistics.h @@ -0,0 +1,38 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_ + +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +struct FileStatistics { + // The length of the file or -1 if finding file length is not supported. + int64 length = -1; + // The last modified time in nanoseconds. + int64 mtime_nsec = 0; + // This is the mode_t from stat.h containing file type and permission + // information. + mode_t mode = 0; + + FileStatistics() {} + ~FileStatistics() {} +}; + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_ diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h index f372b379f53..51074768c5a 100644 --- a/tensorflow/core/platform/file_system.h +++ b/tensorflow/core/platform/file_system.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/file_statistics.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" @@ -34,16 +35,6 @@ class RandomAccessFile; class ReadOnlyMemoryRegion; class WritableFile; -struct FileStatistics { - // The length of the file or -1 if finding file length is not supported. - int64 length; - // The last modified time in nanoseconds. - int64 mtime_nsec; - // This field contains more than just the permissions bits. More information - // can be found on the man page for stat(2). - mode_t mode; -}; - /// A generic interface for accessing a file system. 
class FileSystem { public: diff --git a/tensorflow/core/platform/fingerprint_test.cc b/tensorflow/core/platform/fingerprint_test.cc index b275b0bf41f..e1ed59a85c5 100644 --- a/tensorflow/core/platform/fingerprint_test.cc +++ b/tensorflow/core/platform/fingerprint_test.cc @@ -43,7 +43,7 @@ TEST(Fingerprint128, IsForeverFrozen) { TEST(Fingerprint128, Fprint128Hasher) { // Tests that this compiles: - const std::unordered_set map = {{1, 2}, {3, 4}}; + const std::unordered_set map = {{1, 2}, {3, 4}}; } } // namespace diff --git a/tensorflow/core/lib/monitoring/counter.cc b/tensorflow/core/platform/net.h similarity index 68% rename from tensorflow/core/lib/monitoring/counter.cc rename to tensorflow/core/platform/net.h index 37960a4acd7..9e7851728dd 100644 --- a/tensorflow/core/lib/monitoring/counter.cc +++ b/tensorflow/core/platform/net.h @@ -13,19 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/lib/monitoring/counter.h" - -#include "tensorflow/core/platform/logging.h" +#ifndef TENSORFLOW_PLATFORM_NET_H_ +#define TENSORFLOW_PLATFORM_NET_H_ namespace tensorflow { -namespace monitoring { +namespace internal { -void CounterCell::IncrementBy(const int64 step) { - DCHECK_LE(0, step) << "Must not decrement cumulative metrics."; - value_ += step; -} +int PickUnusedPortOrDie(); -int64 CounterCell::value() const { return value_; } - -} // namespace monitoring +} // namespace internal } // namespace tensorflow + +#endif // TENSORFLOW_PLATFORM_NET_H_ diff --git a/tensorflow/core/platform/net_test.cc b/tensorflow/core/platform/net_test.cc new file mode 100644 index 00000000000..475f4340167 --- /dev/null +++ b/tensorflow/core/platform/net_test.cc @@ -0,0 +1,34 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/net.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace internal { + +TEST(Net, PickUnusedPortOrDie) { + int port0 = PickUnusedPortOrDie(); + int port1 = PickUnusedPortOrDie(); + CHECK_GE(port0, 0); + CHECK_LT(port0, 65536); + CHECK_GE(port1, 0); + CHECK_LT(port1, 65536); + CHECK_NE(port0, port1); +} + +} // namespace internal +} // namespace tensorflow diff --git a/tensorflow/core/platform/posix/net.cc b/tensorflow/core/platform/posix/net.cc new file mode 100644 index 00000000000..2f01b779341 --- /dev/null +++ b/tensorflow/core/platform/posix/net.cc @@ -0,0 +1,129 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/platform/net.h" + +#include +#include + +#include +#include +#include +#include + +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +namespace internal { + +namespace { +bool IsPortAvailable(int* port, bool is_tcp) { + const int protocol = is_tcp ? IPPROTO_TCP : 0; + const int fd = socket(AF_INET, is_tcp ? SOCK_STREAM : SOCK_DGRAM, protocol); + + struct sockaddr_in addr; + socklen_t addr_len = sizeof(addr); + int actual_port; + + CHECK_GE(*port, 0); + CHECK_LE(*port, 65535); + if (fd < 0) { + LOG(ERROR) << "socket() failed: " << strerror(errno); + return false; + } + + // SO_REUSEADDR lets us start up a server immediately after it exits. + int one = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0) { + LOG(ERROR) << "setsockopt() failed: " << strerror(errno); + close(fd); + return false; + } + + // Try binding to port. + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = INADDR_ANY; + addr.sin_port = htons((uint16_t)*port); + if (bind(fd, (struct sockaddr*)&addr, sizeof(addr)) < 0) { + LOG(WARNING) << "bind(port=" << *port << ") failed: " << strerror(errno); + close(fd); + return false; + } + + // Get the bound port number. + if (getsockname(fd, (struct sockaddr*)&addr, &addr_len) < 0) { + LOG(WARNING) << "getsockname() failed: " << strerror(errno); + close(fd); + return false; + } + CHECK_LE(addr_len, sizeof(addr)); + actual_port = ntohs(addr.sin_port); + CHECK_GT(actual_port, 0); + if (*port == 0) { + *port = actual_port; + } else { + CHECK_EQ(*port, actual_port); + } + close(fd); + return true; +} + +const int kNumRandomPortsToPick = 100; +const int kMaximumTrials = 1000; + +} // namespace + +int PickUnusedPortOrDie() { + static std::unordered_set chosen_ports; + + // Type of port to first pick in the next iteration. 
+ bool is_tcp = true; + int trial = 0; + while (true) { + int port; + trial++; + CHECK_LE(trial, kMaximumTrials) + << "Failed to pick an unused port for testing."; + if (trial == 1) { + port = getpid() % (65536 - 30000) + 30000; + } else if (trial <= kNumRandomPortsToPick) { + port = rand() % (65536 - 30000) + 30000; + } else { + port = 0; + } + + if (chosen_ports.find(port) != chosen_ports.end()) { + continue; + } + if (!IsPortAvailable(&port, is_tcp)) { + continue; + } + + CHECK_GT(port, 0); + if (!IsPortAvailable(&port, !is_tcp)) { + is_tcp = !is_tcp; + continue; + } + + chosen_ports.insert(port); + return port; + } + + return 0; +} + +} // namespace internal +} // namespace tensorflow diff --git a/tensorflow/core/platform/posix/test.cc b/tensorflow/core/platform/posix/test.cc index fe16a898788..f83fccaa227 100644 --- a/tensorflow/core/platform/posix/test.cc +++ b/tensorflow/core/platform/posix/test.cc @@ -13,16 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/platform/test.h" - -#include -#include - -#include #include -#include -#include -#include + +#include "tensorflow/core/platform/net.h" +#include "tensorflow/core/platform/test.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" @@ -84,101 +78,7 @@ std::unique_ptr CreateSubProcess(const std::vector& argv) { return std::unique_ptr(new PosixSubProcess(argv)); } -namespace { -bool IsPortAvailable(int* port, bool is_tcp) { - const int protocol = is_tcp ? IPPROTO_TCP : 0; - const int fd = socket(AF_INET, is_tcp ? 
SOCK_STREAM : SOCK_DGRAM, protocol); - - struct sockaddr_in addr; - socklen_t addr_len = sizeof(addr); - int actual_port; - - CHECK_GE(*port, 0); - CHECK_LE(*port, 65535); - if (fd < 0) { - LOG(ERROR) << "socket() failed: " << strerror(errno); - return false; - } - - // SO_REUSEADDR lets us start up a server immediately after it exists. - int one = 1; - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0) { - LOG(ERROR) << "setsockopt() failed: " << strerror(errno); - close(fd); - return false; - } - - // Try binding to port. - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = INADDR_ANY; - addr.sin_port = htons((uint16_t)*port); - if (bind(fd, (struct sockaddr*)&addr, sizeof(addr)) < 0) { - LOG(WARNING) << "bind(port=" << *port << ") failed: " << strerror(errno); - close(fd); - return false; - } - - // Get the bound port number. - if (getsockname(fd, (struct sockaddr*)&addr, &addr_len) < 0) { - LOG(WARNING) << "getsockname() failed: " << strerror(errno); - close(fd); - return false; - } - CHECK_LE(addr_len, sizeof(addr)); - actual_port = ntohs(addr.sin_port); - CHECK_GT(actual_port, 0); - if (*port == 0) { - *port = actual_port; - } else { - CHECK_EQ(*port, actual_port); - } - close(fd); - return true; -} - -const int kNumRandomPortsToPick = 100; -const int kMaximumTrials = 1000; - -} // namespace - -int PickUnusedPortOrDie() { - static std::unordered_set chosen_ports; - - // Type of port to first pick in the next iteration. 
- bool is_tcp = true; - int trial = 0; - while (true) { - int port; - trial++; - CHECK_LE(trial, kMaximumTrials) - << "Failed to pick an unused port for testing."; - if (trial == 1) { - port = getpid() % (65536 - 30000) + 30000; - } else if (trial <= kNumRandomPortsToPick) { - port = rand() % (65536 - 30000) + 30000; - } else { - port = 0; - } - - if (chosen_ports.find(port) != chosen_ports.end()) { - continue; - } - if (!IsPortAvailable(&port, is_tcp)) { - continue; - } - - CHECK_GT(port, 0); - if (!IsPortAvailable(&port, !is_tcp)) { - is_tcp = !is_tcp; - continue; - } - - chosen_ports.insert(port); - return port; - } - - return 0; -} +int PickUnusedPortOrDie() { return internal::PickUnusedPortOrDie(); } string TensorFlowSrcRoot() { // 'bazel test' sets TEST_SRCDIR, and also TEST_WORKSPACE if a new diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto index 33f38019d01..6f78f8cd8a9 100644 --- a/tensorflow/core/protobuf/config.proto +++ b/tensorflow/core/protobuf/config.proto @@ -180,7 +180,7 @@ message ConfigProto { int64 operation_timeout_in_ms = 11; }; -// EXPERIMENTAL. Option for watching a node +// EXPERIMENTAL. Option for watching a node. message DebugTensorWatch { // Name of the node to watch. string node_name = 1; @@ -196,6 +196,12 @@ message DebugTensorWatch { // One or more than one probes on a tensor. // e.g., {"DebugIdentity", "DebugNanCount"} repeated string debug_ops = 3; + + // URL(s) for debug targets(s). + // E.g., "file:///foo/tfdbg_dump", "grpc://localhost:11011" + // Each debug op listed in debug_ops will publish its output tensor (debug + // signal) to all URLs in debug_urls. + repeated string debug_urls = 4; } // EXPERIMENTAL. Options for a single Run() call. 
diff --git a/tensorflow/core/protobuf/saver.proto b/tensorflow/core/protobuf/saver.proto index b130c7343b4..c6b5e1c938a 100644 --- a/tensorflow/core/protobuf/saver.proto +++ b/tensorflow/core/protobuf/saver.proto @@ -6,7 +6,7 @@ option java_outer_classname = "SaverProtos"; option java_multiple_files = true; option java_package = "org.tensorflow.util"; -// Protocol buffer representing the configuration of a SaveRestoreHelper. +// Protocol buffer representing the configuration of a Saver. message SaverDef { // The name of the tensor in which to specify the filename when saving or // restoring a model checkpoint. diff --git a/tensorflow/core/public/README.md b/tensorflow/core/public/README.md index cd1cefbb158..45767e1c8c8 100644 --- a/tensorflow/core/public/README.md +++ b/tensorflow/core/public/README.md @@ -21,7 +21,7 @@ Then: ```python import tensorflow as tf -with tf.Session("local"): +with tf.Session(): input1 = tf.constant(1.0, shape=[1, 1], name="input1") input2 = tf.constant(2.0, shape=[1, 1], name="input2") output = tf.matmul(input1, input2) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 443eabaee02..3f6fa3826a0 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -19,8 +19,8 @@ limitations under the License. // TensorFlow uses semantic versioning, see http://semver.org/. #define TF_MAJOR_VERSION 0 -#define TF_MINOR_VERSION 9 -#define TF_PATCH_VERSION 0 +#define TF_MINOR_VERSION 10 +#define TF_PATCH_VERSION 0rc0 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", // "-beta", "-rc", "-rc.1") @@ -64,9 +64,10 @@ limitations under the License. // 8. Replace RandomCrop from C++ with pure Python (5feb2016). // 9. Deprecate batch_norm_with_global_normalization (16feb2016). // 10. Deprecate conv3d_backprop_{filter,input} (10jun2016). +// 11. Deprecate {batch}_self_adjoint_eig (3aug2016). 
#define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 10 +#define TF_GRAPH_DEF_VERSION 11 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // diff --git a/tensorflow/core/util/example_proto_helper.cc b/tensorflow/core/util/example_proto_helper.cc index f2072d7b211..397ee8bb7d8 100644 --- a/tensorflow/core/util/example_proto_helper.cc +++ b/tensorflow/core/util/example_proto_helper.cc @@ -230,8 +230,11 @@ Status SingleExampleProtoToTensors( const Tensor& default_value = feature_config.default_value; bool required = (default_value.NumElements() == 0); const auto& feature_found = feature_dict.find(key); + const bool feature_has_data = // Found key & data type is set + (feature_found != feature_dict.end() && + (feature_found->second.kind_case() != Feature::KIND_NOT_SET)); - bool required_ok = (feature_found != feature_dict.end()) || !required; + const bool required_ok = feature_has_data || !required; if (!required_ok) { return errors::InvalidArgument("Name: ", example_name, ", Feature: ", key, " is required but could not be found."); @@ -239,7 +242,7 @@ Status SingleExampleProtoToTensors( // Perform the FeatureDenseCopy into the output dense_values tensor (if // the value is present). 
- if (feature_found != feature_dict.end()) { + if (feature_has_data) { const Feature& f = feature_found->second; bool types_match; TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match)); @@ -266,7 +269,7 @@ Status SingleExampleProtoToTensors( const DataType& dtype = feature_config.dtype; const auto& feature_found = feature_dict.find(key); - bool feature_has_data = // Found key & data type is set + const bool feature_has_data = // Found key & data type is set (feature_found != feature_dict.end() && (feature_found->second.kind_case() != Feature::KIND_NOT_SET)); @@ -318,9 +321,9 @@ Status BatchExampleProtoToTensors( std::vector* output_sparse_indices_tensor, std::vector* output_sparse_values_tensor, std::vector* output_sparse_shapes_tensor) { - int batch_size = examples.size(); + const int batch_size = examples.size(); - bool has_names = (names.size() > 0); + const bool has_names = (names.size() > 0); if (has_names) { if (names.size() != examples.size()) { return errors::InvalidArgument( diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h index 4992c772848..79064e9988d 100644 --- a/tensorflow/core/util/sparse/sparse_tensor.h +++ b/tensorflow/core/util/sparse/sparse_tensor.h @@ -150,7 +150,7 @@ class SparseTensor { // Picks out the dimensions according to `dim_indices`. 
std::vector PickDims(gtl::ArraySlice dim_indices) { std::vector res(dim_indices.size()); - for (int i = 0; i < dim_indices.size(); ++i) { + for (size_t i = 0; i < dim_indices.size(); ++i) { res[i] = shape_.dim_size(dim_indices[i]); } return res; diff --git a/tensorflow/examples/how_tos/reading_data/convert_to_records.py b/tensorflow/examples/how_tos/reading_data/convert_to_records.py index 2e3035731ad..566d554e7f3 100644 --- a/tensorflow/examples/how_tos/reading_data/convert_to_records.py +++ b/tensorflow/examples/how_tos/reading_data/convert_to_records.py @@ -19,7 +19,6 @@ from __future__ import division from __future__ import print_function import os -import numpy import tensorflow as tf from tensorflow.contrib.learn.python.learn.datasets import mnist diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py index 8a43158062c..9a33afd93ab 100644 --- a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py +++ b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py @@ -30,10 +30,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os.path import time - -import numpy import tensorflow as tf from tensorflow.examples.tutorials.mnist import input_data diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py index 0711bed920f..b4c80e53b66 100644 --- a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py +++ b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py @@ -29,10 +29,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os.path import time - -import numpy import tensorflow as tf from tensorflow.examples.tutorials.mnist import input_data diff --git 
a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py index bdd821373fd..351d531e253 100644 --- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py +++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py @@ -29,8 +29,6 @@ from __future__ import print_function import os.path import time - -import numpy import tensorflow as tf from tensorflow.examples.tutorials.mnist import mnist diff --git a/tensorflow/examples/image_retraining/retrain_test.py b/tensorflow/examples/image_retraining/retrain_test.py index 91108abde09..072998ae600 100644 --- a/tensorflow/examples/image_retraining/retrain_test.py +++ b/tensorflow/examples/image_retraining/retrain_test.py @@ -18,12 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os import tensorflow as tf from tensorflow.examples.image_retraining import retrain from tensorflow.python.framework import test_util -from tensorflow.python.platform import googletest class ImageRetrainingTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/examples/learn/wide_n_deep_tutorial.py b/tensorflow/examples/learn/wide_n_deep_tutorial.py index f80b839156c..5a23087b5a7 100644 --- a/tensorflow/examples/learn/wide_n_deep_tutorial.py +++ b/tensorflow/examples/learn/wide_n_deep_tutorial.py @@ -59,7 +59,7 @@ def maybe_download(): urllib.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", train_file.name) # pylint: disable=line-too-long train_file_name = train_file.name train_file.close() - print("Training data is downlaoded to %s" % train_file_name) + print("Training data is downloaded to %s" % train_file_name) if FLAGS.test_data: test_file_name = FLAGS.test_data @@ -68,7 +68,7 @@ def maybe_download(): urllib.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", test_file.name) # 
pylint: disable=line-too-long test_file_name = test_file.name test_file.close() - print("Test data is downlaoded to %s" % test_file_name) + print("Test data is downloaded to %s" % test_file_name) return train_file_name, test_file_name diff --git a/tensorflow/examples/skflow/multioutput_regression.py b/tensorflow/examples/skflow/multioutput_regression.py index ef76a6ce270..cf978e23d4d 100644 --- a/tensorflow/examples/skflow/multioutput_regression.py +++ b/tensorflow/examples/skflow/multioutput_regression.py @@ -23,7 +23,6 @@ from __future__ import print_function import numpy as np import matplotlib.pyplot as plt -from sklearn import datasets from sklearn.metrics import mean_squared_error from tensorflow.contrib import learn diff --git a/tensorflow/examples/tutorials/monitors/iris_monitors.py b/tensorflow/examples/tutorials/monitors/iris_monitors.py new file mode 100644 index 00000000000..43ec271b598 --- /dev/null +++ b/tensorflow/examples/tutorials/monitors/iris_monitors.py @@ -0,0 +1,70 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Model training for Iris data set using Validation Monitor.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + +tf.logging.set_verbosity(tf.logging.INFO) + +# Data sets +IRIS_TRAINING = "iris_training.csv" +IRIS_TEST = "iris_test.csv" + +# Load datasets. 
+training_set = tf.contrib.learn.datasets.base.load_csv(filename=IRIS_TRAINING, + target_dtype=np.int) +test_set = tf.contrib.learn.datasets.base.load_csv(filename=IRIS_TEST, + target_dtype=np.int) + +validation_metrics = {"accuracy": tf.contrib.metrics.streaming_accuracy, + "precision": tf.contrib.metrics.streaming_precision, + "recall": tf.contrib.metrics.streaming_recall} +validation_monitor = tf.contrib.learn.monitors.ValidationMonitor( + test_set.data, + test_set.target, + every_n_steps=50, + metrics=validation_metrics, + early_stopping_metric="loss", + early_stopping_metric_minimize=True, + early_stopping_rounds=200) + +# Build 3 layer DNN with 10, 20, 10 units respectively. +classifier = tf.contrib.learn.DNNClassifier(hidden_units=[10, 20, 10], + n_classes=3, + model_dir="/tmp/iris_model", + config=tf.contrib.learn.RunConfig( + save_checkpoints_secs=1)) + +# Fit model. +classifier.fit(x=training_set.data, + y=training_set.target, + steps=2000, + monitors=[validation_monitor]) + +# Evaluate accuracy. +accuracy_score = classifier.evaluate(x=test_set.data, + y=test_set.target)["accuracy"] +print("Accuracy: {0:f}".format(accuracy_score)) + +# Classify two new flower samples. 
+new_samples = np.array( + [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float) +y = classifier.predict(new_samples) +print("Predictions: {}".format(str(y))) diff --git a/tensorflow/examples/tutorials/monitors/iris_test.csv b/tensorflow/examples/tutorials/monitors/iris_test.csv new file mode 100644 index 00000000000..5929d91f52e --- /dev/null +++ b/tensorflow/examples/tutorials/monitors/iris_test.csv @@ -0,0 +1,31 @@ +30,4,setosa,versicolor,virginica +5.9,3.0,4.2,1.5,1 +6.9,3.1,5.4,2.1,2 +5.1,3.3,1.7,0.5,0 +6.0,3.4,4.5,1.6,1 +5.5,2.5,4.0,1.3,1 +6.2,2.9,4.3,1.3,1 +5.5,4.2,1.4,0.2,0 +6.3,2.8,5.1,1.5,2 +5.6,3.0,4.1,1.3,1 +6.7,2.5,5.8,1.8,2 +7.1,3.0,5.9,2.1,2 +4.3,3.0,1.1,0.1,0 +5.6,2.8,4.9,2.0,2 +5.5,2.3,4.0,1.3,1 +6.0,2.2,4.0,1.0,1 +5.1,3.5,1.4,0.2,0 +5.7,2.6,3.5,1.0,1 +4.8,3.4,1.9,0.2,0 +5.1,3.4,1.5,0.2,0 +5.7,2.5,5.0,2.0,2 +5.4,3.4,1.7,0.2,0 +5.6,3.0,4.5,1.5,1 +6.3,2.9,5.6,1.8,2 +6.3,2.5,4.9,1.5,1 +5.8,2.7,3.9,1.2,1 +6.1,3.0,4.6,1.4,1 +5.2,4.1,1.5,0.1,0 +6.7,3.1,4.7,1.5,1 +6.7,3.3,5.7,2.5,2 +6.4,2.9,4.3,1.3,1 diff --git a/tensorflow/examples/tutorials/monitors/iris_training.csv b/tensorflow/examples/tutorials/monitors/iris_training.csv new file mode 100644 index 00000000000..f5ae1054a16 --- /dev/null +++ b/tensorflow/examples/tutorials/monitors/iris_training.csv @@ -0,0 +1,121 @@ +120,4,setosa,versicolor,virginica +6.4,2.8,5.6,2.2,2 +5.0,2.3,3.3,1.0,1 +4.9,2.5,4.5,1.7,2 +4.9,3.1,1.5,0.1,0 +5.7,3.8,1.7,0.3,0 +4.4,3.2,1.3,0.2,0 +5.4,3.4,1.5,0.4,0 +6.9,3.1,5.1,2.3,2 +6.7,3.1,4.4,1.4,1 +5.1,3.7,1.5,0.4,0 +5.2,2.7,3.9,1.4,1 +6.9,3.1,4.9,1.5,1 +5.8,4.0,1.2,0.2,0 +5.4,3.9,1.7,0.4,0 +7.7,3.8,6.7,2.2,2 +6.3,3.3,4.7,1.6,1 +6.8,3.2,5.9,2.3,2 +7.6,3.0,6.6,2.1,2 +6.4,3.2,5.3,2.3,2 +5.7,4.4,1.5,0.4,0 +6.7,3.3,5.7,2.1,2 +6.4,2.8,5.6,2.1,2 +5.4,3.9,1.3,0.4,0 +6.1,2.6,5.6,1.4,2 +7.2,3.0,5.8,1.6,2 +5.2,3.5,1.5,0.2,0 +5.8,2.6,4.0,1.2,1 +5.9,3.0,5.1,1.8,2 +5.4,3.0,4.5,1.5,1 +6.7,3.0,5.0,1.7,1 +6.3,2.3,4.4,1.3,1 +5.1,2.5,3.0,1.1,1 +6.4,3.2,4.5,1.5,1 +6.8,3.0,5.5,2.1,2 
+6.2,2.8,4.8,1.8,2 +6.9,3.2,5.7,2.3,2 +6.5,3.2,5.1,2.0,2 +5.8,2.8,5.1,2.4,2 +5.1,3.8,1.5,0.3,0 +4.8,3.0,1.4,0.3,0 +7.9,3.8,6.4,2.0,2 +5.8,2.7,5.1,1.9,2 +6.7,3.0,5.2,2.3,2 +5.1,3.8,1.9,0.4,0 +4.7,3.2,1.6,0.2,0 +6.0,2.2,5.0,1.5,2 +4.8,3.4,1.6,0.2,0 +7.7,2.6,6.9,2.3,2 +4.6,3.6,1.0,0.2,0 +7.2,3.2,6.0,1.8,2 +5.0,3.3,1.4,0.2,0 +6.6,3.0,4.4,1.4,1 +6.1,2.8,4.0,1.3,1 +5.0,3.2,1.2,0.2,0 +7.0,3.2,4.7,1.4,1 +6.0,3.0,4.8,1.8,2 +7.4,2.8,6.1,1.9,2 +5.8,2.7,5.1,1.9,2 +6.2,3.4,5.4,2.3,2 +5.0,2.0,3.5,1.0,1 +5.6,2.5,3.9,1.1,1 +6.7,3.1,5.6,2.4,2 +6.3,2.5,5.0,1.9,2 +6.4,3.1,5.5,1.8,2 +6.2,2.2,4.5,1.5,1 +7.3,2.9,6.3,1.8,2 +4.4,3.0,1.3,0.2,0 +7.2,3.6,6.1,2.5,2 +6.5,3.0,5.5,1.8,2 +5.0,3.4,1.5,0.2,0 +4.7,3.2,1.3,0.2,0 +6.6,2.9,4.6,1.3,1 +5.5,3.5,1.3,0.2,0 +7.7,3.0,6.1,2.3,2 +6.1,3.0,4.9,1.8,2 +4.9,3.1,1.5,0.1,0 +5.5,2.4,3.8,1.1,1 +5.7,2.9,4.2,1.3,1 +6.0,2.9,4.5,1.5,1 +6.4,2.7,5.3,1.9,2 +5.4,3.7,1.5,0.2,0 +6.1,2.9,4.7,1.4,1 +6.5,2.8,4.6,1.5,1 +5.6,2.7,4.2,1.3,1 +6.3,3.4,5.6,2.4,2 +4.9,3.1,1.5,0.1,0 +6.8,2.8,4.8,1.4,1 +5.7,2.8,4.5,1.3,1 +6.0,2.7,5.1,1.6,1 +5.0,3.5,1.3,0.3,0 +6.5,3.0,5.2,2.0,2 +6.1,2.8,4.7,1.2,1 +5.1,3.5,1.4,0.3,0 +4.6,3.1,1.5,0.2,0 +6.5,3.0,5.8,2.2,2 +4.6,3.4,1.4,0.3,0 +4.6,3.2,1.4,0.2,0 +7.7,2.8,6.7,2.0,2 +5.9,3.2,4.8,1.8,1 +5.1,3.8,1.6,0.2,0 +4.9,3.0,1.4,0.2,0 +4.9,2.4,3.3,1.0,1 +4.5,2.3,1.3,0.3,0 +5.8,2.7,4.1,1.0,1 +5.0,3.4,1.6,0.4,0 +5.2,3.4,1.4,0.2,0 +5.3,3.7,1.5,0.2,0 +5.0,3.6,1.4,0.2,0 +5.6,2.9,3.6,1.3,1 +4.8,3.1,1.6,0.2,0 +6.3,2.7,4.9,1.8,2 +5.7,2.8,4.1,1.3,1 +5.0,3.0,1.6,0.2,0 +6.3,3.3,6.0,2.5,2 +5.0,3.5,1.6,0.6,0 +5.5,2.6,4.4,1.2,1 +5.7,3.0,4.2,1.2,1 +4.4,2.9,1.4,0.2,0 +4.8,3.0,1.4,0.1,0 +5.5,2.4,3.7,1.0,1 diff --git a/tensorflow/g3doc/api_docs/cc/ClassEnv.md b/tensorflow/g3doc/api_docs/cc/ClassEnv.md index 1e5b0ade49a..0010c0fbb23 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassEnv.md +++ b/tensorflow/g3doc/api_docs/cc/ClassEnv.md @@ -38,7 +38,7 @@ Returns the file system schemes registered for this Env . 
-#### `Status tensorflow::Env::NewRandomAccessFile(const string &fname, RandomAccessFile **result)` {#Status_tensorflow_Env_NewRandomAccessFile} +#### `Status tensorflow::Env::NewRandomAccessFile(const string &fname, std::unique_ptr< RandomAccessFile > *result)` {#Status_tensorflow_Env_NewRandomAccessFile} Creates a brand new random access read-only file with the specified name. @@ -48,7 +48,7 @@ The returned file may be concurrently accessed by multiple threads. The ownership of the returned RandomAccessFile is passed to the caller and the object should be deleted when is not used. The file object shouldn't live longer than the Env object. -#### `Status tensorflow::Env::NewWritableFile(const string &fname, WritableFile **result)` {#Status_tensorflow_Env_NewWritableFile} +#### `Status tensorflow::Env::NewWritableFile(const string &fname, std::unique_ptr< WritableFile > *result)` {#Status_tensorflow_Env_NewWritableFile} Creates an object that writes to a new file with the specified name. @@ -58,7 +58,7 @@ The returned file will only be accessed by one thread at a time. The ownership of the returned WritableFile is passed to the caller and the object should be deleted when is not used. The file object shouldn't live longer than the Env object. -#### `Status tensorflow::Env::NewAppendableFile(const string &fname, WritableFile **result)` {#Status_tensorflow_Env_NewAppendableFile} +#### `Status tensorflow::Env::NewAppendableFile(const string &fname, std::unique_ptr< WritableFile > *result)` {#Status_tensorflow_Env_NewAppendableFile} Creates an object that either appends to an existing file, or writes to a new file (if the file does not exist to begin with). @@ -68,7 +68,7 @@ The returned file will only be accessed by one thread at a time. The ownership of the returned WritableFile is passed to the caller and the object should be deleted when is not used. The file object shouldn't live longer than the Env object. 
-#### `Status tensorflow::Env::NewReadOnlyMemoryRegionFromFile(const string &fname, ReadOnlyMemoryRegion **result)` {#Status_tensorflow_Env_NewReadOnlyMemoryRegionFromFile} +#### `Status tensorflow::Env::NewReadOnlyMemoryRegionFromFile(const string &fname, std::unique_ptr< ReadOnlyMemoryRegion > *result)` {#Status_tensorflow_Env_NewReadOnlyMemoryRegionFromFile} Creates a readonly region of memory with the file context. @@ -96,6 +96,20 @@ Deletes the named file. +#### `Status tensorflow::Env::DeleteRecursively(const string &dirname, int64 *undeleted_files, int64 *undeleted_dirs)` {#Status_tensorflow_Env_DeleteRecursively} + +Deletes the specified directory and all subdirectories and files underneath it. undeleted_files and undeleted_dirs stores the number of files and directories that weren't deleted (unspecified if the return status is not OK). REQUIRES: undeleted_files, undeleted_dirs to be not null. Typical return codes. + + + +OK - dirname exists and we were able to delete everything underneath. + +NOT_FOUND - dirname doesn't exist + +PERMISSION_DENIED - dirname or some descendant is not writable + +UNIMPLEMENTED - Some underlying functions (like Delete) are not implemented + #### `Status tensorflow::Env::CreateDir(const string &dirname)` {#Status_tensorflow_Env_CreateDir} Creates the specified directory. @@ -108,6 +122,28 @@ Deletes the specified directory. +#### `Status tensorflow::Env::Stat(const string &fname, FileStatistics *stat)` {#Status_tensorflow_Env_Stat} + +Obtains statistics for the given path. + + + +#### `Status tensorflow::Env::IsDirectory(const string &fname)` {#Status_tensorflow_Env_IsDirectory} + +Returns whether the given path is a directory or not. Typical return codes (not guaranteed exhaustive): + + + +OK - The path exists and is a directory. + +FAILED_PRECONDITION - The path exists and is not a directory. + +NOT_FOUND - The path entry does not exist. + +PERMISSION_DENIED - Insufficient permissions. 
+ +UNIMPLEMENTED - The file factory doesn't support directories. + #### `Status tensorflow::Env::GetFileSize(const string &fname, uint64 *file_size)` {#Status_tensorflow_Env_GetFileSize} Stores the size of `fname` in `*file_size`. @@ -126,7 +162,13 @@ Returns the number of micro-seconds since some fixed point in time. Only useful -#### `virtual void tensorflow::Env::SleepForMicroseconds(int micros)=0` {#virtual_void_tensorflow_Env_SleepForMicroseconds} +#### `virtual uint64 tensorflow::Env::NowSeconds()` {#virtual_uint64_tensorflow_Env_NowSeconds} + +Returns the number of seconds since some fixed point in time. Only useful for computing deltas of time. + + + +#### `virtual void tensorflow::Env::SleepForMicroseconds(int64 micros)=0` {#virtual_void_tensorflow_Env_SleepForMicroseconds} Sleeps/delays the thread for the prescribed number of micro-seconds. @@ -144,7 +186,7 @@ Caller takes ownership of the result and must delete it eventually (the deletion -#### `virtual void tensorflow::Env::SchedClosureAfter(int micros, std::function< void()> closure)=0` {#virtual_void_tensorflow_Env_SchedClosureAfter} +#### `virtual void tensorflow::Env::SchedClosureAfter(int64 micros, std::function< void()> closure)=0` {#virtual_void_tensorflow_Env_SchedClosureAfter} diff --git a/tensorflow/g3doc/api_docs/cc/ClassEnvWrapper.md b/tensorflow/g3doc/api_docs/cc/ClassEnvWrapper.md index 2e284ac8159..f0041f5be92 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassEnvWrapper.md +++ b/tensorflow/g3doc/api_docs/cc/ClassEnvWrapper.md @@ -48,7 +48,7 @@ Returns the number of micro-seconds since some fixed point in time. Only useful -#### `void tensorflow::EnvWrapper::SleepForMicroseconds(int micros) override` {#void_tensorflow_EnvWrapper_SleepForMicroseconds} +#### `void tensorflow::EnvWrapper::SleepForMicroseconds(int64 micros) override` {#void_tensorflow_EnvWrapper_SleepForMicroseconds} Sleeps/delays the thread for the prescribed number of micro-seconds. 
@@ -66,7 +66,7 @@ Caller takes ownership of the result and must delete it eventually (the deletion -#### `void tensorflow::EnvWrapper::SchedClosureAfter(int micros, std::function< void()> closure) override` {#void_tensorflow_EnvWrapper_SchedClosureAfter} +#### `void tensorflow::EnvWrapper::SchedClosureAfter(int64 micros, std::function< void()> closure) override` {#void_tensorflow_EnvWrapper_SchedClosureAfter} diff --git a/tensorflow/g3doc/api_docs/cc/ClassTensor.md b/tensorflow/g3doc/api_docs/cc/ClassTensor.md index cc271aae374..e221a026935 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassTensor.md +++ b/tensorflow/g3doc/api_docs/cc/ClassTensor.md @@ -8,9 +8,13 @@ Represents an n-dimensional array of values. #### `tensorflow::Tensor::Tensor()` {#tensorflow_Tensor_Tensor} -Default Tensor constructor. Creates a 1-dimension, 0-element float tensor. +Creates a 1-dimensional, 0-element float tensor. +The returned Tensor is not a scalar (shape {}), but is instead an empty one-dimensional Tensor (shape {0}, NumElements() == 0). Since it has no elements, it does not need to be assigned a value and is initialized by default ( IsInitialized() is true). If this is undesirable, consider creating a one-element scalar which does require initialization: +```c++ Tensor(DT_FLOAT, TensorShape({})) + +``` #### `tensorflow::Tensor::Tensor(DataType type, const TensorShape &shape)` {#tensorflow_Tensor_Tensor} @@ -32,9 +36,9 @@ Creates a tensor with the input `type` and `shape`, using the allocator `a` and #### `tensorflow::Tensor::Tensor(DataType type)` {#tensorflow_Tensor_Tensor} -Creates an uninitialized Tensor of the given data type. - +Creates an empty Tensor of the given data type. +Like Tensor() , returns a 1-dimensional, 0-element Tensor with IsInitialized() returning True. See the Tensor() documentation for details. #### `tensorflow::Tensor::Tensor(const Tensor &other)` {#tensorflow_Tensor_Tensor} @@ -42,12 +46,18 @@ Creates an uninitialized Tensor of the given data type. 
-#### `tensorflow::Tensor::~Tensor()` {#tensorflow_Tensor_Tensor} +#### `tensorflow::Tensor::Tensor(Tensor &&other)` {#tensorflow_Tensor_Tensor} Copy constructor. +#### `tensorflow::Tensor::~Tensor()` {#tensorflow_Tensor_Tensor} + + + + + #### `DataType tensorflow::Tensor::dtype() const` {#DataType_tensorflow_Tensor_dtype} Returns the data type. @@ -98,9 +108,9 @@ Convenience accessor for the tensor shape. #### `bool tensorflow::Tensor::IsInitialized() const` {#bool_tensorflow_Tensor_IsInitialized} -Has this Tensor been initialized? - +If necessary, has this Tensor been initialized? +Zero-element Tensors are always considered initialized, even if they have never been assigned to and do not have any memory allocated. #### `size_t tensorflow::Tensor::TotalBytes() const` {#size_t_tensorflow_Tensor_TotalBytes} @@ -120,6 +130,12 @@ Assign operator. This tensor shares other's underlying storage. +#### `Tensor & tensorflow::Tensor::operator=(Tensor &&other)` {#Tensor_tensorflow_Tensor_operator_} + +Move operator. See move constructor for details. + + + #### `bool tensorflow::Tensor::CopyFrom(const Tensor &other, const TensorShape &shape) TF_MUST_USE_RESULT` {#bool_tensorflow_Tensor_CopyFrom} Copy the other tensor into this tensor and reshape it. @@ -190,6 +206,12 @@ auto mat = my_mat.matrix();// CHECK fails as type mismatch. +#### `TTypes< T, NDIMS >::Tensor tensorflow::Tensor::bit_casted_tensor()` {#TTypes_T_NDIMS_Tensor_tensorflow_Tensor_bit_casted_tensor} + +Return the tensor data to an `Eigen::Tensor` with the same size but a bitwise cast to the specified dtype `T`. + +Using a bitcast is useful for move and copy operations. NOTE: this is the same as `tensor()` except a bitcast is allowed. + #### `TTypes::Flat tensorflow::Tensor::flat()` {#TTypes_T_Flat_tensorflow_Tensor_flat} Return the tensor data as an `Eigen::Tensor` of the data type and a specified shape. 
@@ -239,6 +261,12 @@ Returns the data as an Eigen::Tensor with NDIMS dimensions, collapsing all Tenso +#### `TTypes< T, NDIMS >::Tensor tensorflow::Tensor::bit_casted_shaped(gtl::ArraySlice< int64 > new_sizes)` {#TTypes_T_NDIMS_Tensor_tensorflow_Tensor_bit_casted_shaped} + +Return the tensor data to an `Eigen::Tensor` with the new shape specified in `new_sizes` and cast to a new dtype `T`. + +Using a bitcast is useful for move and copy operations. The allowed bitcast is the only difference from `shaped()`. + #### `TTypes< T, NDIMS >::UnalignedTensor tensorflow::Tensor::unaligned_shaped(gtl::ArraySlice< int64 > new_sizes)` {#TTypes_T_NDIMS_UnalignedTensor_tensorflow_Tensor_unaligned_shaped} @@ -269,6 +297,12 @@ Const versions of all the methods above. +#### `TTypes< T, NDIMS >::ConstTensor tensorflow::Tensor::bit_casted_tensor() const` {#TTypes_T_NDIMS_ConstTensor_tensorflow_Tensor_bit_casted_tensor} + +Return the tensor data to an `Eigen::Tensor` with the same size but a bitwise cast to the specified dtype `T`. + +Using a bitcast is useful for move and copy operations. NOTE: this is the same as `tensor()` except a bitcast is allowed. + #### `TTypes::ConstFlat tensorflow::Tensor::flat() const` {#TTypes_T_ConstFlat_tensorflow_Tensor_flat} @@ -287,6 +321,12 @@ Const versions of all the methods above. +#### `TTypes< T, NDIMS >::ConstTensor tensorflow::Tensor::bit_casted_shaped(gtl::ArraySlice< int64 > new_sizes) const` {#TTypes_T_NDIMS_ConstTensor_tensorflow_Tensor_bit_casted_shaped} + +Return the tensor data to an `Eigen::Tensor` with the new shape specified in `new_sizes` and cast to a new dtype `T`. + +Using a bitcast is useful for move and copy operations. The allowed bitcast is the only difference from `shaped()`. 
+ #### `TTypes< T, NDIMS >::UnalignedConstTensor tensorflow::Tensor::unaligned_shaped(gtl::ArraySlice< int64 > new_sizes) const` {#TTypes_T_NDIMS_UnalignedConstTensor_tensorflow_Tensor_unaligned_shaped} @@ -337,7 +377,7 @@ The returned ` StringPiece ` may point to memory location on devices that the CP NOTE: The underlying tensor buffer is refcounted, so the lifetime of the contents mapped by the ` StringPiece ` matches the lifetime of the buffer; callers should arrange to make sure the buffer does not get destroyed while the ` StringPiece ` is still used. -REQUIRES: `DataTypeCanUseMemcpy( dtype() )`. +REQUIRES: `DataTypeCanUseMemcpy(dtype())`. #### `void tensorflow::Tensor::UnsafeCopyFromInternal(const Tensor &, const TensorShape &)` {#void_tensorflow_Tensor_UnsafeCopyFromInternal} diff --git a/tensorflow/g3doc/api_docs/cc/ClassTensorShape.md b/tensorflow/g3doc/api_docs/cc/ClassTensorShape.md index d0be205c3b0..5eba11a0df7 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassTensorShape.md +++ b/tensorflow/g3doc/api_docs/cc/ClassTensorShape.md @@ -60,6 +60,18 @@ Copy the specified shape. +#### `tensorflow::TensorShape::TensorShape(TensorShape &&b)` {#tensorflow_TensorShape_TensorShape} + +Move the specified shape. After moving, is safe for destruction and. + + + +#### `void tensorflow::TensorShape::operator=(TensorShape &&b)` {#void_tensorflow_TensorShape_operator_} + + + + + #### `void tensorflow::TensorShape::Clear()` {#void_tensorflow_TensorShape_Clear} Clear a tensor shape. diff --git a/tensorflow/g3doc/api_docs/cc/ClassTensorShapeUtils.md b/tensorflow/g3doc/api_docs/cc/ClassTensorShapeUtils.md index 6010dd48b7e..761feccae20 100644 --- a/tensorflow/g3doc/api_docs/cc/ClassTensorShapeUtils.md +++ b/tensorflow/g3doc/api_docs/cc/ClassTensorShapeUtils.md @@ -36,13 +36,25 @@ Static helper routines for ` TensorShape `. 
Includes a few common predicates on -#### `static Status tensorflow::TensorShapeUtils::MakeShape(const int32 *dims, int n, TensorShape *out)` {#static_Status_tensorflow_TensorShapeUtils_MakeShape} +#### `static Status tensorflow::TensorShapeUtils::MakeShape(const int32 *dims, int64 n, TensorShape *out)` {#static_Status_tensorflow_TensorShapeUtils_MakeShape} Returns a ` TensorShape ` whose dimensions are `dims[0]`, `dims[1]`, ..., `dims[n-1]`. -#### `static Status tensorflow::TensorShapeUtils::MakeShape(const int64 *dims, int n, TensorShape *out)` {#static_Status_tensorflow_TensorShapeUtils_MakeShape} +#### `static Status tensorflow::TensorShapeUtils::MakeShape(const int64 *dims, int64 n, TensorShape *out)` {#static_Status_tensorflow_TensorShapeUtils_MakeShape} + + + + + +#### `static Status tensorflow::TensorShapeUtils::MakeShape(gtl::ArraySlice< int32 > shape, TensorShape *out)` {#static_Status_tensorflow_TensorShapeUtils_MakeShape} + + + + + +#### `static Status tensorflow::TensorShapeUtils::MakeShape(gtl::ArraySlice< int64 > shape, TensorShape *out)` {#static_Status_tensorflow_TensorShapeUtils_MakeShape} diff --git a/tensorflow/g3doc/api_docs/cc/StructTF_Buffer.md b/tensorflow/g3doc/api_docs/cc/StructTF_Buffer.md index c435db80298..084beffe66a 100644 --- a/tensorflow/g3doc/api_docs/cc/StructTF_Buffer.md +++ b/tensorflow/g3doc/api_docs/cc/StructTF_Buffer.md @@ -18,7 +18,7 @@ -#### `void(* TF_Buffer::data_deallocator) (void *data, size_t length))(void *data, size_t length)` {#void_TF_Buffer_data_deallocator_void_data_size_t_length_} +#### `void(* TF_Buffer::data_deallocator)(void *data, size_t length))(void *data, size_t length)` {#void_TF_Buffer_data_deallocator_void_data_size_t_length_} diff --git a/tensorflow/g3doc/api_docs/index.md b/tensorflow/g3doc/api_docs/index.md index d074c0ece33..311908dca32 100644 --- a/tensorflow/g3doc/api_docs/index.md +++ b/tensorflow/g3doc/api_docs/index.md @@ -10,9 +10,9 @@ languages like Go, Java, JavaScript, Lua, R, and perhaps 
others. With [SWIG](http://swig.org), it's relatively easy to develop a TensorFlow interface for your favorite language. -Note: Many practical aspects of usage are covered in the Mechanics tab, and -some additional documentation not specific to any particular language API is -available in the Resources tab. +Note: Many practical aspects of usage are covered in the TUTORIALS and +HOW TO tab, and some additional documentation not specific to any +particular language API is available in the RESOURCES tab. * [Python API](python/index.md) * [C++ API](cc/index.md) diff --git a/tensorflow/g3doc/api_docs/python/array_ops.md b/tensorflow/g3doc/api_docs/python/array_ops.md index 7a1f82b2b02..9072a483c10 100644 --- a/tensorflow/g3doc/api_docs/python/array_ops.md +++ b/tensorflow/g3doc/api_docs/python/array_ops.md @@ -560,6 +560,94 @@ tf.slice(input, [1, 0, 0], [2, 1, 3]) ==> [[[3, 3, 3]], A `Tensor` the same type as `input`. +- - - + +### `tf.strided_slice(input_, begin, end, strides, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0, name=None)` {#strided_slice} + +Extracts a strided slice from a tensor. + +To a first order, this operation extracts a slice of size `end - begin` +from a tensor `input` +starting at the location specified by `begin`. The slice continues by adding +`stride` to the `begin` index until all dimensions are not less than `end`. +Note that components of stride can be negative, which causes a reverse +slice. + +This operation can be thought of as an encoding of a numpy style sliced +range. Given a python slice input[, , ..., ] +this function will be called as follows. + +`begin`, `end`, and `strides` will be all length n. n is in general +not the same dimensionality as `input`. + +For the ith spec, +`begin_mask`, `end_mask`, `ellipsis_mask`, `new_axis_mask`, +and `shrink_axis_mask` will have the ith bit corresponding to +the ith spec.
+ +If the ith bit of `begin_mask` is non-zero, `begin[i]` is ignored and +the fullest possible range in that dimension is used instead. +`end_mask` works analogously, except with the end range. + +`foo[5:,:,:3]` on a 7x8x9 tensor is equivalent to `foo[5:7,0:8,0:3]`. +`foo[::-1]` reverses a tensor with shape 8. + + +If the ith bit of `ellipsis_mask` is set, as many unspecified dimensions +as needed will be inserted between other dimensions. Only one +non-zero bit is allowed in `ellipsis_mask`. + +For example `foo[3:5,...,4:5]` on a shape 10x3x3x10 tensor is +equivalent to `foo[3:5,:,:,4:5]` and +`foo[3:5,...]` is equivalent to `foo[3:5,:,:,:]`. + +If the ith bit of `new_axis_mask` is one, then a `begin`, +`end`, and `stride` are ignored and a new length 1 dimension is +added at this point in the output tensor. + +For example `foo[3:5,4]` on a 10x8 tensor produces a shape 2 tensor +whereas `foo[3:5,4:5]` produces a shape 2x1 tensor with shrink_mask +being 1<<1 == 2. + +If the ith bit of `shrink_axis_mask` is one, then `begin`, +`end[i]`, and `stride[i]` are used to do a slice in the appropriate +dimension, but the output tensor will be reduced in dimensionality +by one. This is only valid if the ith entry of slice[i]==1. + +NOTE: `begin` and `end` are zero-indexed. +`strides` entries must be non-zero. + + +``` +# 'input' is [[[1, 1, 1], [2, 2, 2]], +# [[3, 3, 3], [4, 4, 4]], +# [[5, 5, 5], [6, 6, 6]]] +tf.strided_slice(input, [1, 0, 0], [2, 1, 3], [1, 1, 1]) ==> [[[3, 3, 3]]] +tf.strided_slice(input, [1, 0, 0], [2, 2, 3], [1, 1, 1]) ==> [[[3, 3, 3], + [4, 4, 4]]] +tf.strided_slice(input, [1, 1, 0], [2, -1, 3], [1, -1, 1]) ==>[[[4, 4, 4], + [3, 3, 3]]] +``` + +##### Args: + + +* `input_`: A `Tensor`. +* `begin`: An `int32` or `int64` `Tensor`. +* `end`: An `int32` or `int64` `Tensor`. +* `strides`: An `int32` or `int64` `Tensor`. +* `begin_mask`: An `int32` mask. +* `end_mask`: An `int32` mask. +* `ellipsis_mask`: An `int32` mask. +* `new_axis_mask`: An `int32` mask.
+* `shrink_axis_mask`: An `int32` mask. +* `name`: A name for the operation (optional). + +##### Returns: + + A `Tensor` the same type as `input`. + + - - - ### `tf.split(split_dim, num_split, value, name='split')` {#split} @@ -1938,6 +2026,33 @@ endian orderings will give different results. A `Tensor` of type `type`. +- - - + +### `tf.contrib.graph_editor.copy(sgv, dst_graph=None, dst_scope='', src_scope='')` {#copy} + +Copy a subgraph. + +##### Args: + + +* `sgv`: the source subgraph-view. This argument is converted to a subgraph + using the same rules than the function subgraph.make_view. +* `dst_graph`: the destination graph. +* `dst_scope`: the destination scope. +* `src_scope`: the source scope. + +##### Returns: + + the subgraph view of the copied subgraph. + +##### Raises: + + +* `TypeError`: if dst_graph is not a tf.Graph. +* `StandardError`: if sgv cannot be converted to a SubGraphView using + the same rules than the function subgraph.make_view. + + - - - ### `tf.shape_n(input, name=None)` {#shape_n} diff --git a/tensorflow/g3doc/api_docs/python/check_ops.md b/tensorflow/g3doc/api_docs/python/check_ops.md index 88463cf092f..b2b720421e8 100644 --- a/tensorflow/g3doc/api_docs/python/check_ops.md +++ b/tensorflow/g3doc/api_docs/python/check_ops.md @@ -7,7 +7,7 @@ - - - -### `tf.assert_negative(x, data=None, summarize=None, name=None)` {#assert_negative} +### `tf.assert_negative(x, data=None, summarize=None, message=None, name=None)` {#assert_negative} Assert the condition `x < 0` holds element-wise. @@ -34,6 +34,7 @@ If `x` is empty this is trivially satisfied. * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_negative". ##### Returns: @@ -43,7 +44,7 @@ If `x` is empty this is trivially satisfied. 
- - - -### `tf.assert_positive(x, data=None, summarize=None, name=None)` {#assert_positive} +### `tf.assert_positive(x, data=None, summarize=None, message=None, name=None)` {#assert_positive} Assert the condition `x > 0` holds element-wise. @@ -70,6 +71,7 @@ If `x` is empty this is trivially satisfied. * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_positive". ##### Returns: @@ -100,7 +102,7 @@ Useful since `Tensor`, `ndarray`, byte/text type are all iterables themselves. - - - -### `tf.assert_non_negative(x, data=None, summarize=None, name=None)` {#assert_non_negative} +### `tf.assert_non_negative(x, data=None, summarize=None, message=None, name=None)` {#assert_non_negative} Assert the condition `x >= 0` holds element-wise. @@ -127,6 +129,7 @@ If `x` is empty this is trivially satisfied. * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_non_negative". @@ -137,7 +140,7 @@ If `x` is empty this is trivially satisfied. - - - -### `tf.assert_non_positive(x, data=None, summarize=None, name=None)` {#assert_non_positive} +### `tf.assert_non_positive(x, data=None, summarize=None, message=None, name=None)` {#assert_non_positive} Assert the condition `x <= 0` holds element-wise. @@ -164,6 +167,7 @@ If `x` is empty this is trivially satisfied. * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. 
* `name`: A name for this operation (optional). Defaults to "assert_non_positive". @@ -174,7 +178,7 @@ If `x` is empty this is trivially satisfied. - - - -### `tf.assert_equal(x, y, data=None, summarize=None, name=None)` {#assert_equal} +### `tf.assert_equal(x, y, data=None, summarize=None, message=None, name=None)` {#assert_equal} Assert the condition `x == y` holds element-wise. @@ -203,6 +207,7 @@ If both `x` and `y` are empty, this is trivially satisfied. * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`, `y`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_equal". ##### Returns: @@ -212,7 +217,7 @@ If both `x` and `y` are empty, this is trivially satisfied. - - - -### `tf.assert_integer(x, data=None, summarize=None, name=None)` {#assert_integer} +### `tf.assert_integer(x, message=None, name=None)` {#assert_integer} Assert that `x` is of integer dtype. @@ -233,19 +238,22 @@ x = tf.with_dependencies([tf.assert_integer(x)], x) * `x`: `Tensor` whose basetype is integer and is not quantized. -* `data`: The tensors to print out if the condition is False. Defaults to - error message and first few entries of `x`. -* `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_integer". +##### Raises: + + +* `TypeError`: If `x.dtype` is anything other than non-quantized integer. + ##### Returns: - Op that raises `InvalidArgumentError` if `x == y` is False. + A `no_op` that does nothing. Type can be determined statically. - - - -### `tf.assert_less(x, y, data=None, summarize=None, name=None)` {#assert_less} +### `tf.assert_less(x, y, data=None, summarize=None, message=None, name=None)` {#assert_less} Assert the condition `x < y` holds element-wise. 
@@ -274,6 +282,7 @@ If both `x` and `y` are empty, this is trivially satisfied. * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`, `y`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_less". ##### Returns: @@ -283,7 +292,7 @@ If both `x` and `y` are empty, this is trivially satisfied. - - - -### `tf.assert_less_equal(x, y, data=None, summarize=None, name=None)` {#assert_less_equal} +### `tf.assert_less_equal(x, y, data=None, summarize=None, message=None, name=None)` {#assert_less_equal} Assert the condition `x <= y` holds element-wise. @@ -312,6 +321,7 @@ If both `x` and `y` are empty, this is trivially satisfied. * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`, `y`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_less_equal" ##### Returns: @@ -321,7 +331,7 @@ If both `x` and `y` are empty, this is trivially satisfied. - - - -### `tf.assert_rank(x, rank, data=None, summarize=None, name=None)` {#assert_rank} +### `tf.assert_rank(x, rank, data=None, summarize=None, message=None, name=None)` {#assert_rank} Assert `x` has rank equal to `rank`. @@ -346,11 +356,13 @@ x = tf.with_dependencies([tf.assert_rank(x, 2)], x) * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_rank". ##### Returns: Op raising `InvalidArgumentError` unless `x` has specified rank. + If static checks determine `x` has correct rank, a `no_op` is returned. 
##### Raises: @@ -360,7 +372,7 @@ x = tf.with_dependencies([tf.assert_rank(x, 2)], x) - - - -### `tf.assert_rank_at_least(x, rank, data=None, summarize=None, name=None)` {#assert_rank_at_least} +### `tf.assert_rank_at_least(x, rank, data=None, summarize=None, message=None, name=None)` {#assert_rank_at_least} Assert `x` has rank equal to `rank` or higher. @@ -385,12 +397,14 @@ x = tf.with_dependencies([tf.assert_rank_at_least(x, 2)], x) * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_rank_at_least". ##### Returns: Op raising `InvalidArgumentError` unless `x` has specified rank or higher. + If static checks determine `x` has correct rank, a `no_op` is returned. ##### Raises: @@ -400,20 +414,26 @@ x = tf.with_dependencies([tf.assert_rank_at_least(x, 2)], x) - - - -### `tf.assert_type(tensor, tf_type)` {#assert_type} +### `tf.assert_type(tensor, tf_type, message=None, name=None)` {#assert_type} -Asserts that the given `Tensor` is of the specified type. +Statically asserts that the given `Tensor` is of the specified type. ##### Args: * `tensor`: A tensorflow `Tensor`. * `tf_type`: A tensorflow type (dtypes.float32, tf.int64, dtypes.bool, etc). +* `message`: A string to prefix to the default message. +* `name`: A name to give this `Op`. Defaults to "assert_type" ##### Raises: -* `ValueError`: If the tensors data type doesn't match tf_type. +* `TypeError`: If the tensors data type doesn't match tf_type. + +##### Returns: + + A `no_op` that does nothing. Type can be determined statically. 
- - - diff --git a/tensorflow/g3doc/api_docs/python/client.md b/tensorflow/g3doc/api_docs/python/client.md index 825fc6b16b3..1490bbaec70 100644 --- a/tensorflow/g3doc/api_docs/python/client.md +++ b/tensorflow/g3doc/api_docs/python/client.md @@ -96,32 +96,55 @@ the session constructor. #### `tf.Session.run(fetches, feed_dict=None, options=None, run_metadata=None)` {#Session.run} -Runs the operations and evaluates the tensors in `fetches`. +Runs operations and evaluates tensors in `fetches`. This method runs one "step" of TensorFlow computation, by running the necessary graph fragment to execute every `Operation` and evaluate every `Tensor` in `fetches`, substituting the values in `feed_dict` for the corresponding input values. -The `fetches` argument may be a single graph element, an arbitrarily nested -list of graph elements, or a dictionary whose values are the above. The type -of `fetches` determines the return value of this method. A graph element can -be one of the following types: +The `fetches` argument may be a single graph element, or an arbitrarily +nested list, tuple, namedtuple, or dict containing graph elements at its +leaves. A graph element can be one of the following types: -* If an element of `fetches` is an - [`Operation`](../../api_docs/python/framework.md#Operation), the - corresponding fetched value will be `None`. -* If an element of `fetches` is a - [`Tensor`](../../api_docs/python/framework.md#Tensor), the corresponding - fetched value will be a numpy ndarray containing the value of that tensor. -* If an element of `fetches` is a - [`SparseTensor`](../../api_docs/python/sparse_ops.md#SparseTensor), - the corresponding fetched value will be a +* An [`Operation`](../../api_docs/python/framework.md#Operation). + The corresponding fetched value will be `None`. +* A [`Tensor`](../../api_docs/python/framework.md#Tensor). + The corresponding fetched value will be a numpy ndarray containing the + value of that tensor. 
+* A [`SparseTensor`](../../api_docs/python/sparse_ops.md#SparseTensor). + The corresponding fetched value will be a [`SparseTensorValue`](../../api_docs/python/sparse_ops.md#SparseTensorValue) containing the value of that sparse tensor. -* If an element of `fetches` is produced by a `get_tensor_handle` op, - the corresponding fetched value will be a numpy ndarray containing the - handle of that tensor. +* A `get_tensor_handle` op. The corresponding fetched value will be a + numpy ndarray containing the handle of that tensor. +* A `string` which is the name of a tensor or operation in the graph. + +The value returned by `run()` has the same shape as the `fetches` argument, +where the leaves are replaced by the corresponding values returned by +TensorFlow. + +Example: + +```python + a = tf.constant([10, 20]) + b = tf.constant([1.0, 2.0]) + # 'fetches' can be a singleton + v = session.run(a) + # v is the numpy array [10, 20] + # 'fetches' can be a list. + v = session.run([a, b]) + # v a Python list with 2 numpy arrays: the numpy array [10, 20] and the + # 1-D array [1.0, 2.0] + # 'fetches' can be arbitrary lists, tuples, namedtuple, dicts: + MyData = collections.namedtuple('MyData', ['a', 'b']) + v = session.run({'k1': MyData(a, b), 'k2': [b, a]}) + # v is a dict with + # v['k1'] is a MyData namedtuple with 'a' the numpy array [10, 20] and + # 'b' the numpy array [1.0, 2.0] + # v['k2'] is a list with the numpy array [1.0, 2.0] and the numpy array + # [10, 20]. +``` The optional `feed_dict` argument allows the caller to override the value of tensors in the graph. Each key in `feed_dict` can be @@ -350,8 +373,7 @@ the session constructor. * `target`: (Optional.) The execution engine to connect to. - Defaults to using an in-process engine. At present, no value - other than the empty string is supported. + Defaults to using an in-process engine. * `graph`: (Optional.) The `Graph` to be launched (described above). 
* `config`: (Optional) `ConfigProto` proto used to configure the session. diff --git a/tensorflow/g3doc/api_docs/python/constant_op.md b/tensorflow/g3doc/api_docs/python/constant_op.md index d5803f925b2..50bcac8506a 100644 --- a/tensorflow/g3doc/api_docs/python/constant_op.md +++ b/tensorflow/g3doc/api_docs/python/constant_op.md @@ -684,3 +684,38 @@ with tf.Session() as sess2: * `seed`: integer. + +## Other Functions and Classes +- - - + +### `tf.contrib.graph_editor.ops(*args, **kwargs)` {#ops} + +Helper to select operations. + +##### Args: + + +* `*args`: list of 1) regular expressions (compiled or not) or 2) (array of) + tf.Operation. tf.Tensor instances are silently ignored. +* `**kwargs`: 'graph': tf.Graph in which to perform the regex query.This is + required when using regex. + 'positive_filter': an elem if selected only if positive_filter(elem) is + True. This is optional. + 'restrict_ops_regex': a regular expression is ignored if it doesn't start + with the substring "(?#ops)". + +##### Returns: + + list of tf.Operation + +##### Raises: + + +* `TypeError`: if the optional keyword argument graph is not a tf.Graph + or if an argument in args is not an (array of) tf.Operation + or an (array of) tf.Tensor (silently ignored) or a string + or a regular expression. +* `ValueError`: if one of the keyword arguments is unexpected or if a regular + expression is used without passing a graph as a keyword argument. + + diff --git a/tensorflow/g3doc/api_docs/python/contrib.distributions.md b/tensorflow/g3doc/api_docs/python/contrib.distributions.md index 289dc497737..78a0fb390e9 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.distributions.md +++ b/tensorflow/g3doc/api_docs/python/contrib.distributions.md @@ -352,6 +352,412 @@ Variance of the distribution. ### Univariate (scalar) distributions +- - - + +### `class tf.contrib.distributions.Binomial` {#Binomial} + +Binomial distribution. 
+ +This distribution is parameterized by a vector `p` of probabilities and `n`, +the total counts. + +#### Mathematical details + +The Binomial is a distribution over the number of successes in `n` independent +trials, with each trial having the same probability of success `p`. +The probability mass function (pmf): + +```pmf(k) = n! / (k! * (n - k)!) * (p)^k * (1 - p)^(n - k)``` + +#### Examples + +Create a single distribution, corresponding to 5 coin flips. + +```python +dist = Binomial(n=5., p=.5) +``` + +Create a single distribution (using logits), corresponding to 5 coin flips. + +```python +dist = Binomial(n=5., logits=0.) +``` + +Creates 3 distributions with the third distribution most likely to have +successes. + +```python +p = [.2, .3, .8] +# n will be broadcast to [4., 4., 4.], to match p. +dist = Binomial(n=4., p=p) +``` + +The distribution functions can be evaluated on counts. + +```python +# counts same shape as p. +counts = [1., 2, 3] +dist.prob(counts) # Shape [3] + +# p will be broadcast to [[.2, .3, .8], [.2, .3, .8]] to match counts. +counts = [[1., 2, 1], [2, 2, 4]] +dist.prob(counts) # Shape [2, 3] + +# p will be broadcast to shape [5, 7, 3] to match counts. +counts = [[...]] # Shape [5, 7, 3] +dist.prob(counts) # Shape [5, 7, 3] +``` +- - - + +#### `tf.contrib.distributions.Binomial.__init__(n, logits=None, p=None, validate_args=True, allow_nan_stats=False, name='Binomial')` {#Binomial.__init__} + +Initialize a batch of Binomial distributions. + +##### Args: + + +* `n`: Non-negative floating point tensor with shape broadcastable to + `[N1,..., Nm]` with `m >= 0` and the same dtype as `p` or `logits`. + Defines this as a batch of `N1 x ... x Nm` different Binomial + distributions. Its components should be equal to integer values. +* `logits`: Floating point tensor representing the log-odds of a + positive event with shape broadcastable to `[N1,..., Nm]` `m >= 0`, and + the same dtype as `n`. 
Each entry represents logits for the probability + of success for independent Binomial distributions. +* `p`: Positive floating point tensor with shape broadcastable to + `[N1,..., Nm]` `m >= 0`, `p in [0, 1]`. Each entry represents the + probability of success for independent Binomial distributions. +* `validate_args`: Whether to assert valid values for parameters `n` and `p`, + and `x` in `prob` and `log_prob`. If `False`, correct behavior is not + guaranteed. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. +* `name`: The name to prefix Ops created by this distribution class. + + +* `Examples`: + +```python +# Define 1-batch of a binomial distribution. +dist = Binomial(n=2., p=.9) + +# Define a 2-batch. +dist = Binomial(n=[4., 5], p=[.1, .3]) +``` + + +- - - + +#### `tf.contrib.distributions.Binomial.allow_nan_stats` {#Binomial.allow_nan_stats} + +Boolean describing behavior when a stat is undefined for batch member. + + +- - - + +#### `tf.contrib.distributions.Binomial.batch_shape(name='batch_shape')` {#Binomial.batch_shape} + +Batch dimensions of this instance as a 1-D int32 `Tensor`. + +The product of the dimensions of the `batch_shape` is the number of +independent distributions of this kind the instance represents. + +##### Args: + + +* `name`: name to give to the op + +##### Returns: + + `Tensor` `batch_shape` + + +- - - + +#### `tf.contrib.distributions.Binomial.cdf(value, name='cdf')` {#Binomial.cdf} + +Cumulative distribution function. + + +- - - + +#### `tf.contrib.distributions.Binomial.dtype` {#Binomial.dtype} + +dtype of samples from this distribution. + + +- - - + +#### `tf.contrib.distributions.Binomial.entropy(name='entropy')` {#Binomial.entropy} + +Entropy of the distribution in nats. 
+ + +- - - + +#### `tf.contrib.distributions.Binomial.event_shape(name='event_shape')` {#Binomial.event_shape} + +Shape of a sample from a single distribution as a 1-D int32 `Tensor`. + +##### Args: + + +* `name`: name to give to the op + +##### Returns: + + `Tensor` `event_shape` + + +- - - + +#### `tf.contrib.distributions.Binomial.get_batch_shape()` {#Binomial.get_batch_shape} + +`TensorShape` available at graph construction time. + +Same meaning as `batch_shape`. May be only partially defined. + +##### Returns: + + batch shape + + +- - - + +#### `tf.contrib.distributions.Binomial.get_event_shape()` {#Binomial.get_event_shape} + +`TensorShape` available at graph construction time. + +Same meaning as `event_shape`. May be only partially defined. + +##### Returns: + + event shape + + +- - - + +#### `tf.contrib.distributions.Binomial.is_continuous` {#Binomial.is_continuous} + + + + +- - - + +#### `tf.contrib.distributions.Binomial.is_reparameterized` {#Binomial.is_reparameterized} + + + + +- - - + +#### `tf.contrib.distributions.Binomial.log_cdf(value, name='log_cdf')` {#Binomial.log_cdf} + +Log CDF. + + +- - - + +#### `tf.contrib.distributions.Binomial.log_pdf(value, name='log_pdf')` {#Binomial.log_pdf} + +Log of the probability density function. + + +- - - + +#### `tf.contrib.distributions.Binomial.log_pmf(value, name='log_pmf')` {#Binomial.log_pmf} + +Log of the probability mass function. + + +- - - + +#### `tf.contrib.distributions.Binomial.log_prob(counts, name='log_prob')` {#Binomial.log_prob} + +`Log(P[counts])`, computed for every batch member. + +For each batch member of counts `k`, `P[counts]` is the probability that +after sampling `n` draws from this Binomial distribution, the number of +successes is `k`. Note that different sequences of draws can result in the +same counts, thus the probability includes a combinatorial coefficient. 
+ +##### Args: + + +* `counts`: Non-negative tensor with dtype `dtype` and whose shape can be + broadcast with `self.p` and `self.n`. `counts` is only legal if it is + less than or equal to `n` and its components are equal to integer + values. +* `name`: Name to give this Op, defaults to "log_prob". + +##### Returns: + + Log probabilities for each record, shape `[N1,...,Nm]`. + + +- - - + +#### `tf.contrib.distributions.Binomial.logits` {#Binomial.logits} + +Log-odds. + + +- - - + +#### `tf.contrib.distributions.Binomial.mean(name='mean')` {#Binomial.mean} + +Mean of the distribution. + + +- - - + +#### `tf.contrib.distributions.Binomial.mode(name='mode')` {#Binomial.mode} + +Mode of the distribution. + +Note that when `(n + 1) * p` is an integer, there are actually two modes. +Namely, `(n + 1) * p` and `(n + 1) * p - 1` are both modes. Here we return +only the larger of the two modes. + +##### Args: + + +* `name`: The name for this op. + +##### Returns: + + The mode of the Binomial distribution. + + +- - - + +#### `tf.contrib.distributions.Binomial.n` {#Binomial.n} + +Number of trials. + + +- - - + +#### `tf.contrib.distributions.Binomial.name` {#Binomial.name} + +Name to prepend to all ops. + + +- - - + +#### `tf.contrib.distributions.Binomial.p` {#Binomial.p} + +Probability of success. + + +- - - + +#### `tf.contrib.distributions.Binomial.pdf(value, name='pdf')` {#Binomial.pdf} + +The probability density function. + + +- - - + +#### `tf.contrib.distributions.Binomial.pmf(value, name='pmf')` {#Binomial.pmf} + +The probability mass function. + + +- - - + +#### `tf.contrib.distributions.Binomial.prob(counts, name='prob')` {#Binomial.prob} + +`P[counts]`, computed for every batch member. + + +For each batch member of counts `k`, `P[counts]` is the probability that +after sampling `n` draws from this Binomial distribution, the number of +successes is `k`. 
Note that different sequences of draws can result in the +same counts, thus the probability includes a combinatorial coefficient. + +##### Args: + + +* `counts`: Non-negative tensor with dtype `dtype` and whose shape can be + broadcast with `self.p` and `self.n`. `counts` is only legal if it is + less than or equal to `n` and its components are equal to integer + values. +* `name`: Name to give this Op, defaults to "prob". + +##### Returns: + + Probabilities for each record, shape `[N1,...,Nm]`. + + +- - - + +#### `tf.contrib.distributions.Binomial.sample(sample_shape=(), seed=None, name='sample')` {#Binomial.sample} + +Generate samples of the specified shape for each batched distribution. + +Note that a call to `sample()` without arguments will generate a single +sample per batched distribution. + +##### Args: + + +* `sample_shape`: `int32` `Tensor` or tuple or list. Shape of the generated + samples. +* `seed`: Python integer seed for RNG +* `name`: name to give to the op. + +##### Returns: + + +* `samples`: a `Tensor` of dtype `self.dtype` and shape + `sample_shape + self.batch_shape + self.event_shape`. + + +- - - + +#### `tf.contrib.distributions.Binomial.sample_n(n, seed=None, name='sample_n')` {#Binomial.sample_n} + +Generate `n` samples. + +##### Args: + + +* `n`: scalar. Number of samples to draw from each distribution. +* `seed`: Python integer seed for RNG +* `name`: name to give to the op. + +##### Returns: + + +* `samples`: a `Tensor` of shape `(n,) + self.batch_shape + self.event_shape` + with values of type `self.dtype`. + + +- - - + +#### `tf.contrib.distributions.Binomial.std(name='std')` {#Binomial.std} + +Standard deviation of the distribution. + + +- - - + +#### `tf.contrib.distributions.Binomial.validate_args` {#Binomial.validate_args} + +Boolean describing behavior on invalid input. + + +- - - + +#### `tf.contrib.distributions.Binomial.variance(name='variance')` {#Binomial.variance} + +Variance of the distribution. 
+ + + - - - ### `class tf.contrib.distributions.Bernoulli` {#Bernoulli} @@ -360,10 +766,6 @@ Bernoulli distribution. The Bernoulli distribution is parameterized by p, the probability of a positive event. - -Note, the following methods of the base class aren't implemented: - * cdf - * log_cdf - - - #### `tf.contrib.distributions.Bernoulli.__init__(logits=None, p=None, dtype=tf.int32, validate_args=True, allow_nan_stats=False, name='Bernoulli')` {#Bernoulli.__init__} @@ -383,10 +785,10 @@ Construct Bernoulli distributions. * `dtype`: dtype for samples. * `validate_args`: Whether to assert that `0 <= p <= 1`. If not validate_args, `log_pmf` may return nans. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: A name for this distribution. ##### Raises: @@ -767,20 +1169,20 @@ Initialize a batch of Beta distributions. ##### Args: -* `a`: Positive `float` or `double` tensor with shape broadcastable to +* `a`: Positive floating point tensor with shape broadcastable to `[N1,..., Nm]` `m >= 0`. Defines this as a batch of `N1 x ... x Nm` different Beta distributions. This also defines the dtype of the distribution. -* `b`: Positive `float` or `double` tensor with shape broadcastable to +* `b`: Positive floating point tensor with shape broadcastable to `[N1,..., Nm]` `m >= 0`. Defines this as a batch of `N1 x ... x Nm` different Beta distributions. * `validate_args`: Whether to assert valid values for parameters `a` and `b`, - and `x` in `prob` and `log_prob`. 
If False, correct behavior is not + and `x` in `prob` and `log_prob`. If `False`, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prefix Ops created by this distribution class. @@ -942,7 +1344,7 @@ Log of the probability mass function. ##### Args: -* `x`: Non-negative `float` or `double`, tensor whose shape can +* `x`: Non-negative floating point tensor whose shape can be broadcast with `self.a` and `self.b`. For fixed leading dimensions, the last dimension represents counts for the corresponding Beta distribution in `self.a` and `self.b`. `x` is only legal if @@ -1012,7 +1414,7 @@ The probability mass function. ##### Args: -* `x`: Non-negative `float`, `double` tensor whose shape can +* `x`: Non-negative floating point tensor whose shape can be broadcast with `self.a` and `self.b`. For fixed leading dimensions, the last dimension represents x for the corresponding Beta distribution in `self.a` and `self.b`. `x` is only legal if is @@ -1098,11 +1500,6 @@ Categorical distribution. The categorical distribution is parameterized by the log-probabilities of a set of classes. - -Note, the following methods of the base class aren't implemented: - * mean - * cdf - * log_cdf - - - #### `tf.contrib.distributions.Categorical.__init__(logits, dtype=tf.int32, validate_args=True, allow_nan_stats=False, name='Categorical')` {#Categorical.__init__} @@ -1118,10 +1515,10 @@ Initialize Categorical distributions using class log-probabilities. 
indexes into the classes. * `dtype`: The type of the event samples (default: int32). * `validate_args`: Unused in this distribution. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: A name for this distribution (optional). @@ -1385,15 +1782,15 @@ Construct Chi2 distributions with parameter `df`. ##### Args: -* `df`: `float` or `double` tensor, the degrees of freedom of the +* `df`: Floating point tensor, the degrees of freedom of the distribution(s). `df` must contain only positive values. * `validate_args`: Whether to assert that `df > 0`, and that `x > 0` in the - methods `prob(x)` and `log_prob(x)`. If `validate_args` is False + methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False` and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prepend to all ops created by this distribution. @@ -1767,15 +2164,15 @@ Construct Exponential distribution with parameter `lam`. 
##### Args: -* `lam`: `float` or `double` tensor, the rate of the distribution(s). +* `lam`: Floating point tensor, the rate of the distribution(s). `lam` must contain only positive values. * `validate_args`: Whether to assert that `lam > 0`, and that `x > 0` in the - methods `prob(x)` and `log_prob(x)`. If `validate_args` is False + methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False` and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prepend to all ops created by this distribution. @@ -2161,19 +2558,19 @@ broadcasting (e.g. `alpha + beta` is a valid operation). ##### Args: -* `alpha`: `float` or `double` tensor, the shape params of the +* `alpha`: Floating point tensor, the shape params of the distribution(s). alpha must contain only positive values. -* `beta`: `float` or `double` tensor, the inverse scale params of the +* `beta`: Floating point tensor, the inverse scale params of the distribution(s). beta must contain only positive values. * `validate_args`: Whether to assert that `a > 0, b > 0`, and that `x > 0` in - the methods `prob(x)` and `log_prob(x)`. If `validate_args` is False + the methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False` and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. 
- If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prepend to all ops created by this distribution. ##### Raises: @@ -2560,18 +2957,18 @@ broadcasting (e.g. `alpha + beta` is a valid operation). ##### Args: -* `alpha`: `float` or `double` tensor, the shape params of the +* `alpha`: Floating point tensor, the shape params of the distribution(s). alpha must contain only positive values. -* `beta`: `float` or `double` tensor, the scale params of the distribution(s). +* `beta`: Floating point tensor, the scale params of the distribution(s). beta must contain only positive values. * `validate_args`: Whether to assert that `a > 0, b > 0`, and that `x > 0` in - the methods `prob(x)` and `log_prob(x)`. If `validate_args` is False + the methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False` and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prepend to all ops created by this distribution. ##### Raises: @@ -2972,17 +3369,17 @@ broadcasting (e.g., `loc / scale` is a valid operation). 
##### Args: -* `loc`: `float` or `double` tensor which characterizes the location (center) +* `loc`: Floating point tensor which characterizes the location (center) of the distribution. -* `scale`: `float` or `double`, positive-valued tensor which characterzes the - spread of the distribution. +* `scale`: Positive floating point tensor which characterizes the spread of + the distribution. * `validate_args`: Whether to validate input with asserts. If `validate_args` is `False`, and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to give Ops created by the initializer. ##### Raises: @@ -3363,15 +3760,15 @@ broadcasting (e.g. `mu + sigma` is a valid operation). ##### Args: -* `mu`: `float` or `double` tensor, the means of the distribution(s). -* `sigma`: `float` or `double` tensor, the stddevs of the distribution(s). +* `mu`: Floating point tensor, the means of the distribution(s). +* `sigma`: Floating point tensor, the stddevs of the distribution(s). sigma must contain only positive values. * `validate_args`: Whether to assert that `sigma > 0`. If `validate_args` is - False, correct output is not guaranteed when input is invalid. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. 
+ `False`, correct output is not guaranteed when input is invalid. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to give Ops created by the initializer. ##### Raises: @@ -3750,19 +4147,19 @@ broadcasting (e.g. `df + mu + sigma` is a valid operation). ##### Args: -* `df`: `float` or `double` tensor, the degrees of freedom of the +* `df`: Floating point tensor, the degrees of freedom of the distribution(s). `df` must contain only positive values. -* `mu`: `float` or `double` tensor, the means of the distribution(s). -* `sigma`: `float` or `double` tensor, the scaling factor for the +* `mu`: Floating point tensor, the means of the distribution(s). +* `sigma`: Floating point tensor, the scaling factor for the distribution(s). `sigma` must contain only positive values. Note that `sigma` is not the standard deviation of this distribution. * `validate_args`: Whether to assert that `df > 0, sigma > 0`. If - `validate_args` is False and inputs are invalid, correct behavior is not - guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + `validate_args` is `False` and inputs are invalid, correct behavior is + not guaranteed. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to give Ops created by the initializer. 
##### Raises: @@ -4102,14 +4499,14 @@ u1 = Uniform(3.0, [5.0, 6.0, 7.0]) # 3 distributions ##### Args: -* `a`: `float` or `double` tensor, the minimum endpoint. -* `b`: `float` or `double` tensor, the maximum endpoint. Must be > `a`. -* `validate_args`: Whether to assert that `a > b`. If `validate_args` is False - and inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `a`: Floating point tensor, the minimum endpoint. +* `b`: Floating point tensor, the maximum endpoint. Must be > `a`. +* `validate_args`: Whether to assert that `a > b`. If `validate_args` is + `False` and inputs are invalid, correct behavior is not guaranteed. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prefix Ops created by this distribution class. ##### Raises: @@ -4403,7 +4800,7 @@ The PDF of this distribution is defined in terms of the diagonal covariance determined by `diag_stdev`: `C_{ii} = diag_stdev[i]**2`. ``` -f(x) = (2*pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 * (x - mu)^T C^{-1} (x - mu)) +f(x) = (2 pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 (x - mu)^T C^{-1} (x - mu)) ``` #### Examples @@ -4446,17 +4843,17 @@ The mean of `X_i` is `mu[i]`, and the standard deviation is `diag_stdev[i]`. ##### Args: -* `mu`: Rank `N + 1` `float` or `double` tensor with shape `[N1,...,Nb, k]`, +* `mu`: Rank `N + 1` floating point tensor with shape `[N1,...,Nb, k]`, `b >= 0`. * `diag_stdev`: Rank `N + 1` `Tensor` with same `dtype` and shape as `mu`, - representing the standard deviations. 
+ representing the standard deviations. Must be positive. * `validate_args`: Whether to validate input with asserts. If `validate_args` is `False`, and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: `Boolean`, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to give Ops created by the initializer. ##### Raises: @@ -4469,7 +4866,7 @@ The mean of `X_i` is `mu[i]`, and the standard deviation is `diag_stdev[i]`. #### `tf.contrib.distributions.MultivariateNormalDiag.allow_nan_stats` {#MultivariateNormalDiag.allow_nan_stats} -Boolean describing behavior when a stat is undefined for batch member. +`Boolean` describing behavior when stats are undefined. - - - @@ -4738,7 +5135,7 @@ Standard deviation of the distribution. #### `tf.contrib.distributions.MultivariateNormalDiag.validate_args` {#MultivariateNormalDiag.validate_args} -Boolean describing behavior on invalid input. +`Boolean` describing behavior on invalid input. - - - @@ -4760,14 +5157,12 @@ Evaluation of the pdf, determinant, and sampling are all `O(k^3)` operations. #### Mathematical details -The PDF of this distribution is: +With `C = sigma`, the PDF of this distribution is: ``` -f(x) = (2*pi)^(-k/2) |det(sigma)|^(-1/2) exp(-1/2*(x-mu)^*.sigma^{-1}.(x-mu)) +f(x) = (2 pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 (x - mu)^T C^{-1} (x - mu)) ``` -where `.` denotes the inner product on `R^k` and `^*` denotes transpose. 
- #### Examples A single multi-variate Gaussian distribution is defined by a vector of means @@ -4805,17 +5200,17 @@ User must provide means `mu` and `sigma`, the mean and covariance. ##### Args: -* `mu`: `(N+1)-D` `float` or `double` tensor with shape `[N1,...,Nb, k]`, +* `mu`: `(N+1)-D` floating point tensor with shape `[N1,...,Nb, k]`, `b >= 0`. * `sigma`: `(N+2)-D` `Tensor` with same `dtype` as `mu` and shape - `[N1,...,Nb, k, k]`. + `[N1,...,Nb, k, k]`. Each batch member must be positive definite. * `validate_args`: Whether to validate input with asserts. If `validate_args` is `False`, and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: `Boolean`, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to give Ops created by the initializer. ##### Raises: @@ -4828,7 +5223,7 @@ User must provide means `mu` and `sigma`, the mean and covariance. #### `tf.contrib.distributions.MultivariateNormalFull.allow_nan_stats` {#MultivariateNormalFull.allow_nan_stats} -Boolean describing behavior when a stat is undefined for batch member. +`Boolean` describing behavior when stats are undefined. - - - @@ -5097,7 +5492,7 @@ Standard deviation of the distribution. #### `tf.contrib.distributions.MultivariateNormalFull.validate_args` {#MultivariateNormalFull.validate_args} -Boolean describing behavior on invalid input. +`Boolean` describing behavior on invalid input. - - - @@ -5120,14 +5515,14 @@ and requires `O(k^2)` storage. 
#### Mathematical details -The PDF of this distribution is: +The Cholesky factor `chol` defines the covariance matrix: `C = chol chol^T`. + +The PDF of this distribution is then: ``` -f(x) = (2*pi)^(-k/2) |det(sigma)|^(-1/2) exp(-1/2*(x-mu)^*.sigma^{-1}.(x-mu)) +f(x) = (2 pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 (x - mu)^T C^{-1} (x - mu)) ``` -where `.` denotes the inner product on `R^k` and `^*` denotes transpose. - #### Examples A single multi-variate Gaussian distribution is defined by a vector of means @@ -5165,22 +5560,23 @@ Trainable (batch) Choesky matrices can be created with Multivariate Normal distributions on `R^k`. User must provide means `mu` and `chol` which holds the (batch) Cholesky -factors `S`, such that the covariance of each batch member is `S S^*`. +factors, such that the covariance of each batch member is `chol chol^T`. ##### Args: -* `mu`: `(N+1)-D` `float` or `double` tensor with shape `[N1,...,Nb, k]`, +* `mu`: `(N+1)-D` floating point tensor with shape `[N1,...,Nb, k]`, `b >= 0`. * `chol`: `(N+2)-D` `Tensor` with same `dtype` as `mu` and shape - `[N1,...,Nb, k, k]`. + `[N1,...,Nb, k, k]`. The upper triangular part is ignored (treated as + though it is zero), and the diagonal must be positive. * `validate_args`: Whether to validate input with asserts. If `validate_args` - is `False`, - and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + is `False`, and the inputs are invalid, correct behavior is not + guaranteed. +* `allow_nan_stats`: `Boolean`, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) 
is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to give Ops created by the initializer. ##### Raises: @@ -5193,7 +5589,7 @@ factors `S`, such that the covariance of each batch member is `S S^*`. #### `tf.contrib.distributions.MultivariateNormalCholesky.allow_nan_stats` {#MultivariateNormalCholesky.allow_nan_stats} -Boolean describing behavior when a stat is undefined for batch member. +`Boolean` describing behavior when stats are undefined. - - - @@ -5462,7 +5858,7 @@ Standard deviation of the distribution. #### `tf.contrib.distributions.MultivariateNormalCholesky.validate_args` {#MultivariateNormalCholesky.validate_args} -Boolean describing behavior on invalid input. +`Boolean` describing behavior on invalid input. - - - @@ -5606,16 +6002,16 @@ Initialize a batch of Dirichlet distributions. ##### Args: -* `alpha`: Positive `float` or `double` tensor with shape broadcastable to +* `alpha`: Positive floating point tensor with shape broadcastable to `[N1,..., Nm, k]` `m >= 0`. Defines this as a batch of `N1 x ... x Nm` different `k` class Dirichlet distributions. * `validate_args`: Whether to assert valid values for parameters `alpha` and - `x` in `prob` and `log_prob`. If False, correct behavior is not + `x` in `prob` and `log_prob`. If `False`, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. 
* `name`: The name to prefix Ops created by this distribution class. @@ -5771,7 +6167,7 @@ Log of the probability mass function. ##### Args: -* `x`: Non-negative `float` or `double`, tensor whose shape can +* `x`: Non-negative tensor with dtype `dtype` and whose shape can be broadcast with `self.alpha`. For fixed leading dimensions, the last dimension represents counts for the corresponding Dirichlet distribution in `self.alpha`. `x` is only legal if it sums up to one. @@ -5840,7 +6236,7 @@ The probability mass function. ##### Args: -* `x`: Non-negative `float`, `double` tensor whose shape can +* `x`: Non-negative tensor with dtype `dtype` and whose shape can be broadcast with `self.alpha`. For fixed leading dimensions, the last dimension represents x for the corresponding Dirichlet distribution in `self.alpha` and `self.beta`. `x` is only legal if it sums up to one. @@ -5990,32 +6386,29 @@ dist.pmf(counts) # Shape [2] ``` - - - -#### `tf.contrib.distributions.DirichletMultinomial.__init__(n, alpha, allow_arbitrary_counts=False, validate_args=True, allow_nan_stats=False, name='DirichletMultinomial')` {#DirichletMultinomial.__init__} +#### `tf.contrib.distributions.DirichletMultinomial.__init__(n, alpha, validate_args=True, allow_nan_stats=False, name='DirichletMultinomial')` {#DirichletMultinomial.__init__} Initialize a batch of DirichletMultinomial distributions. ##### Args: -* `n`: Non-negative `float` or `double` tensor with shape - broadcastable to `[N1,..., Nm]` with `m >= 0`. Defines this as a batch - of `N1 x ... x Nm` different Dirichlet multinomial distributions. Its - components should be equal to integral values. -* `alpha`: Positive `float` or `double` tensor with shape broadcastable to - `[N1,..., Nm, k]` `m >= 0`. Defines this as a batch of `N1 x ... x Nm` - different `k` class Dirichlet multinomial distributions. -* `allow_arbitrary_counts`: Boolean. This represents whether the pmf/cdf - allows for the `counts` tensor to be non-integral values. 
- The pmf/cdf are functions that can be evaluated at non-integral values, - but are only a distribution over non-negative integers. If - `validate_args` is `False`, this assertion is turned off. +* `n`: Non-negative floating point tensor, whose dtype is the same as + `alpha`. The shape is broadcastable to `[N1,..., Nm]` with `m >= 0`. + Defines this as a batch of `N1 x ... x Nm` different Dirichlet + multinomial distributions. Its components should be equal to integer + values. +* `alpha`: Positive floating point tensor, whose dtype is the same as + `n` with shape broadcastable to `[N1,..., Nm, k]` `m >= 0`. Defines + this as a batch of `N1 x ... x Nm` different `k` class Dirichlet + multinomial distributions. * `validate_args`: Whether to assert valid values for parameters `alpha` and - `n`, and `x` in `prob` and `log_prob`. If False, correct behavior is + `n`, and `x` in `prob` and `log_prob`. If `False`, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prefix Ops created by this distribution class. @@ -6177,12 +6570,11 @@ probability includes a combinatorial coefficient. ##### Args: -* `counts`: Non-negative `float` or `double` tensor whose shape can - be broadcast with `self.alpha`. For fixed leading dimensions, the last +* `counts`: Non-negative tensor with dtype `dtype` and whose shape can be + broadcast with `self.alpha`. 
For fixed leading dimensions, the last dimension represents counts for the corresponding Dirichlet Multinomial distribution in `self.alpha`. `counts` is only legal if it sums up to - `n` and its components are equal to integral values. The second - condition is relaxed if `allow_arbitrary_counts` is set. + `n` and its components are equal to integer values. * `name`: Name to give this Op, defaults to "log_prob". ##### Returns: @@ -6247,12 +6639,11 @@ probability includes a combinatorial coefficient. ##### Args: -* `counts`: Non-negative `float`, `double` tensor whose shape can - be broadcast with `self.alpha`. For fixed leading dimensions, the last +* `counts`: Non-negative tensor with dtype `dtype` and whose shape can be + broadcast with `self.alpha`. For fixed leading dimensions, the last dimension represents counts for the corresponding Dirichlet Multinomial distribution in `self.alpha`. `counts` is only legal if it sums up to - `n` and its components are equal to integral values. The second - condition is relaxed if `allow_arbitrary_counts` is set. + `n` and its components are equal to integer values. * `name`: Name to give this Op, defaults to "prob". ##### Returns: @@ -6351,6 +6742,413 @@ Cov(X_i, X_j) = -n * alpha_i * alpha_j / alpha_0 ** 2 * +- - - + +### `class tf.contrib.distributions.Multinomial` {#Multinomial} + +Multinomial distribution. + +This distribution is parameterized by a vector `p` of probability +parameters for `k` classes and `n`, the counts for each class. + +#### Mathematical details + +The Multinomial is a distribution over k-class count data, meaning +for each k-tuple of non-negative integer `counts = [n_1,...,n_k]`, we have a +probability of these draws being made from the distribution. The distribution +has hyperparameters `p = (p_1,...,p_k)`, and probability mass +function (pmf): + +```pmf(counts) = n! / (n_1!...n_k!) * (p_1)^n_1*(p_2)^n_2*...(p_k)^n_k``` + +where above `n = sum_j n_j` and `n!` is `n` factorial. 
+ +#### Examples + +Create a 3-class distribution, with the 3rd class most likely to be drawn, +using logits. + +```python +logits = [-50., -43, 0] +dist = Multinomial(n=4., logits=logits) +``` + +Create a 3-class distribution, with the 3rd class most likely to be drawn. + +```python +p = [.2, .3, .5] +dist = Multinomial(n=4., p=p) +``` + +The distribution functions can be evaluated on counts. + +```python +# counts same shape as p. +counts = [1., 0, 3] +dist.prob(counts) # Shape [] + +# p will be broadcast to [[.2, .3, .5], [.2, .3, .5]] to match counts. +counts = [[1., 2, 1], [2, 2, 0]] +dist.prob(counts) # Shape [2] + +# p will be broadcast to shape [5, 7, 3] to match counts. +counts = [[...]] # Shape [5, 7, 3] +dist.prob(counts) # Shape [5, 7] +``` + +Create a 2-batch of 3-class distributions. + +```python +p = [[.1, .2, .7], [.3, .3, .4]] # Shape [2, 3] +dist = Multinomial(n=[4., 5], p=p) + +counts = [[2., 1, 1], [3, 1, 1]] +dist.prob(counts) # Shape [2] +``` +- - - + +#### `tf.contrib.distributions.Multinomial.__init__(n, logits=None, p=None, validate_args=True, allow_nan_stats=False, name='Multinomial')` {#Multinomial.__init__} + +Initialize a batch of Multinomial distributions. + +##### Args: + + +* `n`: Non-negative floating point tensor with shape broadcastable to + `[N1,..., Nm]` with `m >= 0`. Defines this as a batch of + `N1 x ... x Nm` different Multinomial distributions. Its components + should be equal to integer values. +* `logits`: Floating point tensor representing the log-odds of a + positive event with shape broadcastable to `[N1,..., Nm, k], m >= 0`, + and the same dtype as `n`. Defines this as a batch of `N1 x ... x Nm` + different `k` class Multinomial distributions. +* `p`: Positive floating point tensor with shape broadcastable to + `[N1,..., Nm, k]` `m >= 0` and same dtype as `n`. Defines this as + a batch of `N1 x ... x Nm` different `k` class Multinomial + distributions. 
`p`'s components in the last portion of its shape should + sum up to 1. +* `validate_args`: Whether to assert valid values for parameters `n` and `p`, + and `x` in `prob` and `log_prob`. If `False`, correct behavior is not + guaranteed. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. +* `name`: The name to prefix Ops created by this distribution class. + + +* `Examples`: + +```python +# Define 1-batch of 2-class multinomial distribution, +# also known as a Binomial distribution. +dist = Multinomial(n=2., p=[.1, .9]) + +# Define a 2-batch of 3-class distributions. +dist = Multinomial(n=[4., 5], p=[[.1, .3, .6], [.4, .05, .55]]) +``` + + +- - - + +#### `tf.contrib.distributions.Multinomial.allow_nan_stats` {#Multinomial.allow_nan_stats} + +Boolean describing behavior when a stat is undefined for batch member. + + +- - - + +#### `tf.contrib.distributions.Multinomial.batch_shape(name='batch_shape')` {#Multinomial.batch_shape} + +Batch dimensions of this instance as a 1-D int32 `Tensor`. + +The product of the dimensions of the `batch_shape` is the number of +independent distributions of this kind the instance represents. + +##### Args: + + +* `name`: name to give to the op + +##### Returns: + + `Tensor` `batch_shape` + + +- - - + +#### `tf.contrib.distributions.Multinomial.cdf(value, name='cdf')` {#Multinomial.cdf} + +Cumulative distribution function. + + +- - - + +#### `tf.contrib.distributions.Multinomial.dtype` {#Multinomial.dtype} + +dtype of samples from this distribution. + + +- - - + +#### `tf.contrib.distributions.Multinomial.entropy(name='entropy')` {#Multinomial.entropy} + +Entropy of the distribution in nats. 
+ + +- - - + +#### `tf.contrib.distributions.Multinomial.event_shape(name='event_shape')` {#Multinomial.event_shape} + +Shape of a sample from a single distribution as a 1-D int32 `Tensor`. + +##### Args: + + +* `name`: name to give to the op + +##### Returns: + + `Tensor` `event_shape` + + +- - - + +#### `tf.contrib.distributions.Multinomial.get_batch_shape()` {#Multinomial.get_batch_shape} + +`TensorShape` available at graph construction time. + +Same meaning as `batch_shape`. May be only partially defined. + +##### Returns: + + batch shape + + +- - - + +#### `tf.contrib.distributions.Multinomial.get_event_shape()` {#Multinomial.get_event_shape} + +`TensorShape` available at graph construction time. + +Same meaning as `event_shape`. May be only partially defined. + +##### Returns: + + event shape + + +- - - + +#### `tf.contrib.distributions.Multinomial.is_continuous` {#Multinomial.is_continuous} + + + + +- - - + +#### `tf.contrib.distributions.Multinomial.is_reparameterized` {#Multinomial.is_reparameterized} + + + + +- - - + +#### `tf.contrib.distributions.Multinomial.log_cdf(value, name='log_cdf')` {#Multinomial.log_cdf} + +Log CDF. + + +- - - + +#### `tf.contrib.distributions.Multinomial.log_pdf(value, name='log_pdf')` {#Multinomial.log_pdf} + +Log of the probability density function. + + +- - - + +#### `tf.contrib.distributions.Multinomial.log_pmf(value, name='log_pmf')` {#Multinomial.log_pmf} + +Log of the probability mass function. + + +- - - + +#### `tf.contrib.distributions.Multinomial.log_prob(counts, name='log_prob')` {#Multinomial.log_prob} + +`Log(P[counts])`, computed for every batch member. + +For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability +that after sampling `n` draws from this Multinomial distribution, the +number of draws falling in class `j` is `n_j`. Note that different +sequences of draws can result in the same counts, thus the probability +includes a combinatorial coefficient. 
+ +##### Args: + + +* `counts`: Non-negative tensor with dtype `dtype` and whose shape can + be broadcast with `self.p` and `self.n`. For fixed leading dimensions, + the last dimension represents counts for the corresponding Multinomial + distribution in `self.p`. `counts` is only legal if it sums up to `n` + and its components are equal to integer values. +* `name`: Name to give this Op, defaults to "log_prob". + +##### Returns: + + Log probabilities for each record, shape `[N1,...,Nm]`. + + +- - - + +#### `tf.contrib.distributions.Multinomial.logits` {#Multinomial.logits} + +Log-odds. + + +- - - + +#### `tf.contrib.distributions.Multinomial.mean(name='mean')` {#Multinomial.mean} + +Mean of the distribution. + + +- - - + +#### `tf.contrib.distributions.Multinomial.mode(name='mode')` {#Multinomial.mode} + +Mode of the distribution. + + +- - - + +#### `tf.contrib.distributions.Multinomial.n` {#Multinomial.n} + +Number of trials. + + +- - - + +#### `tf.contrib.distributions.Multinomial.name` {#Multinomial.name} + +Name to prepend to all ops. + + +- - - + +#### `tf.contrib.distributions.Multinomial.p` {#Multinomial.p} + +Event probabilities. + + +- - - + +#### `tf.contrib.distributions.Multinomial.pdf(value, name='pdf')` {#Multinomial.pdf} + +The probability density function. + + +- - - + +#### `tf.contrib.distributions.Multinomial.pmf(value, name='pmf')` {#Multinomial.pmf} + +The probability mass function. + + +- - - + +#### `tf.contrib.distributions.Multinomial.prob(counts, name='prob')` {#Multinomial.prob} + +`P[counts]`, computed for every batch member. + +For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability +that after sampling `n` draws from this Multinomial distribution, the +number of draws falling in class `j` is `n_j`. Note that different +sequences of draws can result in the same counts, thus the probability +includes a combinatorial coefficient. 
+ +##### Args: + + +* `counts`: Non-negative tensor with dtype `dtype` and whose shape can + be broadcast with `self.p` and `self.n`. For fixed leading dimensions, + the last dimension represents counts for the corresponding Multinomial + distribution in `self.p`. `counts` is only legal if it sums up to `n` + and its components are equal to integer values. +* `name`: Name to give this Op, defaults to "prob". + +##### Returns: + + Probabilities for each record, shape `[N1,...,Nm]`. + + +- - - + +#### `tf.contrib.distributions.Multinomial.sample(sample_shape=(), seed=None, name='sample')` {#Multinomial.sample} + +Generate samples of the specified shape for each batched distribution. + +Note that a call to `sample()` without arguments will generate a single +sample per batched distribution. + +##### Args: + + +* `sample_shape`: `int32` `Tensor` or tuple or list. Shape of the generated + samples. +* `seed`: Python integer seed for RNG +* `name`: name to give to the op. + +##### Returns: + + +* `samples`: a `Tensor` of dtype `self.dtype` and shape + `sample_shape + self.batch_shape + self.event_shape`. + + +- - - + +#### `tf.contrib.distributions.Multinomial.sample_n(n, seed=None, name='sample_n')` {#Multinomial.sample_n} + +Generate `n` samples. + +##### Args: + + +* `n`: scalar. Number of samples to draw from each distribution. +* `seed`: Python integer seed for RNG +* `name`: name to give to the op. + +##### Returns: + + +* `samples`: a `Tensor` of shape `(n,) + self.batch_shape + self.event_shape` + with values of type `self.dtype`. + + +- - - + +#### `tf.contrib.distributions.Multinomial.std(name='std')` {#Multinomial.std} + +Standard deviation of the distribution. + + +- - - + +#### `tf.contrib.distributions.Multinomial.validate_args` {#Multinomial.validate_args} + +Boolean describing behavior on invalid input. + + +- - - + +#### `tf.contrib.distributions.Multinomial.variance(name='variance')` {#Multinomial.variance} + +Variance of the distribution. 
+ + + ### Transformed distributions @@ -6851,9 +7649,9 @@ Get the KL-divergence KL(dist_a || dist_b). * `dist_a`: instance of distributions.Distribution. * `dist_b`: instance of distributions.Distribution. -* `allow_nan`: If False (default), a runtime error is raised +* `allow_nan`: If `False` (default), a runtime error is raised if the KL returns NaN values for any batch entry of the given - distributions. If True, the KL may return a NaN for the given entry. + distributions. If `True`, the KL may return a NaN for the given entry. * `name`: (optional) Name scope to use for created operations. ##### Returns: @@ -6974,3 +7772,395 @@ Generate `n` samples. +- - - + +### `class tf.contrib.distributions.MultivariateNormalDiagPlusVDVT` {#MultivariateNormalDiagPlusVDVT} + +The multivariate normal distribution on `R^k`. + +Every batch member of this distribution is defined by a mean and a lightweight +covariance matrix `C`. + +#### Mathematical details + +The PDF of this distribution in terms of the mean `mu` and covariance `C` is: + +``` +f(x) = (2 pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 (x - mu)^T C^{-1} (x - mu)) +``` + +For every batch member, this distribution represents `k` random variables +`(X_1,...,X_k)`, with mean `E[X_i] = mu[i]`, and covariance matrix +`C_{ij} := E[(X_i - mu[i])(X_j - mu[j])]` + +The user initializes this class by providing the mean `mu`, and a lightweight +definition of `C`: + +``` +C = SS^T = SS = (M + V D V^T) (M + V D V^T) +M is diagonal (k x k) +V = is shape (k x r), typically r << k +D = is diagonal (r x r), optional (defaults to identity). +``` + +This allows for `O(kr + r^3)` pdf evaluation and determinant, and `O(kr)` +sampling and storage (per batch member). + +#### Examples + +A single multi-variate Gaussian distribution is defined by a vector of means +of length `k`, and square root of the covariance `S = M + V D V^T`. Extra +leading dimensions, if provided, allow for batches. 
+ +```python +# Initialize a single 3-variate Gaussian with covariance square root +# S = M + V D V^T, where V D V^T is a matrix-rank 2 update. +mu = [1, 2, 3.] +diag_large = [1.1, 2.2, 3.3] +v = ... # shape 3 x 2 +diag_small = [4., 5.] +dist = tf.contrib.distributions.MultivariateNormalDiagPlusVDVT( + mu, diag_large, v, diag_small=diag_small) + +# Evaluate this on an observation in R^3, returning a scalar. +dist.pdf([-1, 0, 1]) + +# Initialize a batch of two 3-variate Gaussians. This time, don't provide +# diag_small. This means S = M + V V^T. +mu = [[1, 2, 3], [11, 22, 33]] # shape 2 x 3 +diag_large = ... # shape 2 x 3 +v = ... # shape 2 x 3 x 1, a matrix-rank 1 update. +dist = tf.contrib.distributions.MultivariateNormalDiagPlusVDVT( + mu, diag_large, v) + +# Evaluate this on a two observations, each in R^3, returning a length two +# tensor. +x = [[-1, 0, 1], [-11, 0, 11]] # Shape 2 x 3. +dist.pdf(x) +``` +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.__init__(mu, diag_large, v, diag_small=None, validate_args=True, allow_nan_stats=False, name='MultivariateNormalDiagPlusVDVT')` {#MultivariateNormalDiagPlusVDVT.__init__} + +Multivariate Normal distributions on `R^k`. + +For every batch member, this distribution represents `k` random variables +`(X_1,...,X_k)`, with mean `E[X_i] = mu[i]`, and covariance matrix +`C_{ij} := E[(X_i - mu[i])(X_j - mu[j])]` + +The user initializes this class by providing the mean `mu`, and a +lightweight definition of `C`: + +``` +C = SS^T = SS = (M + V D V^T) (M + V D V^T) +M is diagonal (k x k) +V = is shape (k x r), typically r << k +D = is diagonal (r x r), optional (defaults to identity). +``` + +##### Args: + + +* `mu`: Rank `n + 1` floating point tensor with shape `[N1,...,Nn, k]`, + `n >= 0`. The means. +* `diag_large`: Optional rank `n + 1` floating point tensor, shape + `[N1,...,Nn, k]` `n >= 0`. Defines the diagonal matrix `M`. 
+* `v`: Rank `n + 1` floating point tensor, shape `[N1,...,Nn, k, r]` + `n >= 0`. Defines the matrix `V`. +* `diag_small`: Rank `n + 1` floating point tensor, shape + `[N1,...,Nn, k]` `n >= 0`. Defines the diagonal matrix `D`. Default + is `None`, which means `D` will be the identity matrix. +* `validate_args`: Whether to validate input with asserts. If `validate_args` + is `False`, + and the inputs are invalid, correct behavior is not guaranteed. +* `allow_nan_stats`: `Boolean`, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. +* `name`: The name to give Ops created by the initializer. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.allow_nan_stats` {#MultivariateNormalDiagPlusVDVT.allow_nan_stats} + +`Boolean` describing behavior when stats are undefined. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.batch_shape(name='batch_shape')` {#MultivariateNormalDiagPlusVDVT.batch_shape} + +Batch dimensions of this instance as a 1-D int32 `Tensor`. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.cdf(value, name='cdf')` {#MultivariateNormalDiagPlusVDVT.cdf} + +Cumulative distribution function. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.dtype` {#MultivariateNormalDiagPlusVDVT.dtype} + + + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.entropy(name='entropy')` {#MultivariateNormalDiagPlusVDVT.entropy} + +The entropies of these Multivariate Normals. + +##### Args: + + +* `name`: The name to give this op. + +##### Returns: + + +* `entropy`: tensor of dtype `dtype`, the entropies. 
+ + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.event_shape(name='event_shape')` {#MultivariateNormalDiagPlusVDVT.event_shape} + +Shape of a sample from a single distribution as a 1-D int32 `Tensor`. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.get_batch_shape()` {#MultivariateNormalDiagPlusVDVT.get_batch_shape} + +`TensorShape` available at graph construction time. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.get_event_shape()` {#MultivariateNormalDiagPlusVDVT.get_event_shape} + +`TensorShape` available at graph construction time. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.is_continuous` {#MultivariateNormalDiagPlusVDVT.is_continuous} + + + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.is_reparameterized` {#MultivariateNormalDiagPlusVDVT.is_reparameterized} + + + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.log_cdf(value, name='log_cdf')` {#MultivariateNormalDiagPlusVDVT.log_cdf} + +Log CDF. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.log_pdf(value, name='log_pdf')` {#MultivariateNormalDiagPlusVDVT.log_pdf} + +Log of the probability density function. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.log_pmf(value, name='log_pmf')` {#MultivariateNormalDiagPlusVDVT.log_pmf} + +Log of the probability mass function. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.log_prob(x, name='log_prob')` {#MultivariateNormalDiagPlusVDVT.log_prob} + +Log prob of observations `x` given these Multivariate Normals. + +`x` is a batch vector with compatible shape if `x` is a `Tensor` whose +shape can be broadcast up to either: + +```` +self.batch_shape + self.event_shape +OR +[M1,...,Mm] + self.batch_shape + self.event_shape +``` + +##### Args: + + +* `x`: Compatible batch vector with same `dtype` as this distribution. 
+* `name`: The name to give this op. + +##### Returns: + + +* `log_prob`: tensor of dtype `dtype`, the log-PDFs of `x`. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.log_sigma_det(name='log_sigma_det')` {#MultivariateNormalDiagPlusVDVT.log_sigma_det} + +Log of determinant of covariance matrix. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.mean(name='mean')` {#MultivariateNormalDiagPlusVDVT.mean} + +Mean of each batch member. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.mode(name='mode')` {#MultivariateNormalDiagPlusVDVT.mode} + +Mode of each batch member. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.mu` {#MultivariateNormalDiagPlusVDVT.mu} + + + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.name` {#MultivariateNormalDiagPlusVDVT.name} + + + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.pdf(value, name='pdf')` {#MultivariateNormalDiagPlusVDVT.pdf} + +The probability density function. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.pmf(value, name='pmf')` {#MultivariateNormalDiagPlusVDVT.pmf} + +The probability mass function. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.prob(x, name='prob')` {#MultivariateNormalDiagPlusVDVT.prob} + +The PDF of observations `x` under these Multivariate Normals. + +`x` is a batch vector with compatible shape if `x` is a `Tensor` whose +shape can be broadcast up to either: + +```` +self.batch_shape + self.event_shape +OR +[M1,...,Mm] + self.batch_shape + self.event_shape +``` + +##### Args: + + +* `x`: Compatible batch vector with same `dtype` as this distribution. +* `name`: The name to give this op. + +##### Returns: + + +* `prob`: tensor of dtype `dtype`, the prob values of `x`. 
+ + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.sample(sample_shape=(), seed=None, name='sample')` {#MultivariateNormalDiagPlusVDVT.sample} + +Generate samples of the specified shape for each batched distribution. + +Note that a call to `sample()` without arguments will generate a single +sample per batched distribution. + +##### Args: + + +* `sample_shape`: `int32` `Tensor` or tuple or list. Shape of the generated + samples. +* `seed`: Python integer seed for RNG +* `name`: name to give to the op. + +##### Returns: + + +* `samples`: a `Tensor` of dtype `self.dtype` and shape + `sample_shape + self.batch_shape + self.event_shape`. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.sample_n(n, seed=None, name='sample_n')` {#MultivariateNormalDiagPlusVDVT.sample_n} + +Sample `n` observations from the Multivariate Normal Distributions. + +##### Args: + + +* `n`: `Scalar`, type int32, the number of observations to sample. +* `seed`: Python integer, the random seed. +* `name`: The name to give this op. + +##### Returns: + + +* `samples`: `[n, ...]`, a `Tensor` of `n` samples for each + of the distributions determined by broadcasting the hyperparameters. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.sigma` {#MultivariateNormalDiagPlusVDVT.sigma} + +Dense (batch) covariance matrix, if available. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.sigma_det(name='sigma_det')` {#MultivariateNormalDiagPlusVDVT.sigma_det} + +Determinant of covariance matrix. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.std(name='std')` {#MultivariateNormalDiagPlusVDVT.std} + +Standard deviation of the distribution. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.validate_args` {#MultivariateNormalDiagPlusVDVT.validate_args} + +`Boolean` describing behavior on invalid input. 
+ + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.variance(name='variance')` {#MultivariateNormalDiagPlusVDVT.variance} + +Variance of each batch member. + + + diff --git a/tensorflow/g3doc/api_docs/python/contrib.ffmpeg.md b/tensorflow/g3doc/api_docs/python/contrib.ffmpeg.md index f069df8a55d..674ba5e2b0e 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.ffmpeg.md +++ b/tensorflow/g3doc/api_docs/python/contrib.ffmpeg.md @@ -46,7 +46,8 @@ Create an op that decodes the contents of an audio file. A rank 2 tensor that has time along dimension 0 and channels along dimension 1. Dimension 0 will be `samples_per_second * length` wide, and - dimension 1 will be `channel_count` wide. + dimension 1 will be `channel_count` wide. If ffmpeg fails to decode the + audio then an empty tensor will be returned. - - - diff --git a/tensorflow/g3doc/api_docs/python/contrib.framework.md b/tensorflow/g3doc/api_docs/python/contrib.framework.md index df4df30d199..0a6c8119248 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.framework.md +++ b/tensorflow/g3doc/api_docs/python/contrib.framework.md @@ -324,15 +324,14 @@ Assert tensors are the same shape, from the same graph. Decorator for marking functions or methods deprecated. -This decorator adds a deprecation warning to a function's docstring. It has -the following format: +This decorator logs a deprecation warning whenever the decorated function is +called. It has the following format: (from ) is deprecated and will be removed after . Instructions for updating: -whenever the decorated function is called. will include the class -name if it is a method. + will include the class name if it is a method. It also edits the docstring of the function: ' (deprecated)' is appended to the first line of the docstring and a deprecation notice is prepended @@ -356,6 +355,44 @@ to the rest of the docstring. * `ValueError`: If date is not in ISO 8601 format, or instructions are empty. 
+- - - + +### `tf.contrib.framework.deprecated_arg_values(date, instructions, **deprecated_kwargs)` {#deprecated_arg_values} + +Decorator for marking specific function argument values as deprecated. + +This decorator logs a deprecation warning whenever the decorated function is +called with the deprecated argument values. It has the following format: + + Calling (from ) with = is deprecated and + will be removed after . Instructions for updating: + + + will include the class name if it is a method. + +It also edits the docstring of the function: ' (deprecated arguments)' is +appended to the first line of the docstring and a deprecation notice is +prepended to the rest of the docstring. + +##### Args: + + +* `date`: String. The date the function is scheduled to be removed. Must be + ISO 8601 (YYYY-MM-DD). +* `instructions`: String. Instructions on how to update code using the + deprecated function. +* `**deprecated_kwargs`: The deprecated argument values. + +##### Returns: + + Decorated function or method. + +##### Raises: + + +* `ValueError`: If date is not in ISO 8601 format, or instructions are empty. + + ## Arg_Scope - - - diff --git a/tensorflow/g3doc/api_docs/python/contrib.graph_editor.md b/tensorflow/g3doc/api_docs/python/contrib.graph_editor.md new file mode 100644 index 00000000000..be6fa7bde55 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/contrib.graph_editor.md @@ -0,0 +1,859 @@ + + +# Graph Editor (contrib) +[TOC] + +Graph editor module allows to modify an existing graph in place. + +## Other Functions and Classes +- - - + +### `class tf.contrib.graph_editor.SubGraphView` {#SubGraphView} + +A subgraph view on an existing tf.Graph. + +An instance of this class is a subgraph view on an existing tf.Graph. +"subgraph" means that it can represent part of the whole tf.Graph. +"view" means that it only provides a passive observation and do not to act +on the tf.Graph. 
Note that in this documentation, the term "subgraph" is often +used as substitute to "subgraph view". + +A subgraph contains: +- a list of input tensors, accessible via the "inputs" property. +- a list of output tensors, accessible via the "outputs" property. +- and the operations in between, accessible via the "ops" property. + +An subgraph can be seen as a function F(i0, i1, ...) -> o0, o1, ... It is a +function which takes as input some input tensors and returns as output some +output tensors. The computation that the function performs is encoded in the +operations of the subgraph. + +The tensors (input or output) can be of two kinds: +- connected: a connected tensor connects to at least one operation contained +in the subgraph. One example is a subgraph representing a single operation +and its inputs and outputs: all the input and output tensors of the op +are "connected". +- passthrough: a passthrough tensor does not connect to any operation +contained in the subgraph. One example is a subgraph representing a +single tensor: this tensor is passthrough. By default a passthrough tensor is +present both in the input and output tensors of the subgraph. It can however +be remapped to only appear as an input (or output) only. + +The input and output tensors can be remapped. For instance, some input tensor +can be ommited. For instance, a subgraph representing an operation with two +inputs can be remapped to only take one input. Note that this does not change +at all the underlying tf.Graph (remember, it is a view). It means that +the other input is being ignored, or is being treated as "given". +The analogy with functions can be extended like this: F(x,y) is the original +function. Remapping the inputs from [x, y] to just [x] means that the subgraph +now represent the function F_y(x) (y is "given"). + +The output tensors can also be remapped. For instance, some output tensor can +be ommited. Other output tensor can be duplicated as well. 
As mentioned +before, this does not change at all the underlying tf.Graph. +The analogy with functions can be extended like this: F(...)->x,y is the +original function. Remapping the outputs from [x, y] to just [y,y] means that +the subgraph now represent the function M(F(...)) where M is the function +M(a,b)->b,b. + +It is useful to describe three other kind of tensors: +- internal: an internal tensor is a tensor connecting operations contained +in the subgraph. One example in the subgraph representing the two operations +A and B connected sequentially: -> A -> B ->. The middle arrow is an internal +tensor. +- actual input: an input tensor of the subgraph, regardless of whether it is + listed in "inputs" or not (masked-out). +- actual output: an output tensor of the subgraph, regardless of whether it is + listed in "outputs" or not (masked-out). +- hidden input: an actual input which has been masked-out using an + input remapping. In other word, a hidden input is a non-internal tensor + not listed as a input tensor and one of whose consumers belongs to + the subgraph. +- hidden output: a actual output which has been masked-out using an output + remapping. In other word, a hidden output is a non-internal tensor + not listed as an output and one of whose generating operations belongs to + the subgraph. + +Here are some usefull guarantees about an instance of a SubGraphView: +- the input (or output) tensors are not internal. +- the input (or output) tensors are either "connected" or "passthrough". +- the passthrough tensors are not connected to any of the operation of +the subgraph. + +Note that there is no guarantee that an operation in a subgraph contributes +at all to its inputs or outputs. For instance, remapping both the inputs and +outputs to empty lists will produce a subgraph which still contains all the +original operations. 
However, the remove_unused_ops function can be used to +make a new subgraph view whose operations are connected to at least one of +the input or output tensors. + +An instance of this class is meant to be a lightweight object which is not +modified in-place by the user. Rather, the user can create new modified +instances of a given subgraph. In that sense, the class SubGraphView is meant +to be used like an immutable python object. + +A common problem when using views is that they can get out-of-sync with the +data they observe (in this case, a tf.Graph). This is up to the user to insure +that this doesn't happen. To keep on the safe sife, it is recommended that +the life time of subgraph views are kept very short. One way to achieve this +is to use subgraphs within a "with make_sgv(...) as sgv:" Python context. + +To alleviate the out-of-sync problem, some functions are granted the right to +modified subgraph in place. This is typically the case of graph manipulation +functions which, given some subgraphs as arguments, can modify the underlying +tf.Graph. Since this modification is likely to render the subgraph view +invalid, those functions can modify the argument in place to reflect the +change. For instance, calling the function swap_inputs(svg0, svg1) will modify +svg0 and svg1 in place to reflect the fact that their inputs have now being +swapped. +- - - + +#### `tf.contrib.graph_editor.SubGraphView.__init__(inside_ops=(), passthrough_ts=())` {#SubGraphView.__init__} + +Create a subgraph containing the given ops and the "passthrough" tensors. + +##### Args: + + +* `inside_ops`: an object convertible to a list of tf.Operation. This list + defines all the operations in the subgraph. +* `passthrough_ts`: an object convertible to a list of tf.Tensor. This list + define all the "passthrough" tensors. A passthrough tensor is a tensor + which goes directly from the input of the subgraph to it output, without + any intermediate operations. 
All the non passthrough tensors are + silently ignored. + +##### Raises: + + +* `TypeError`: if inside_ops cannot be converted to a list of tf.Operation or + if passthrough_ts cannot be converted to a list of tf.Tensor. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.connected_inputs` {#SubGraphView.connected_inputs} + +The connected input tensors of this subgraph view. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.connected_outputs` {#SubGraphView.connected_outputs} + +The connected output tensors of this subgraph view. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.consumers()` {#SubGraphView.consumers} + +Return a Python set of all the consumers of this subgraph view. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.copy()` {#SubGraphView.copy} + +Return a copy of itself. + +Note that this class is a "view", copying it only create another view and +does not copy the underlying part of the tf.Graph. + +##### Returns: + + a new instance identical to the original one. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.find_op_by_name(op_name)` {#SubGraphView.find_op_by_name} + +Return the op named op_name. + +##### Args: + + +* `op_name`: the name to search for + +##### Returns: + + The op named op_name. + +##### Raises: + + +* `ValueError`: if the op_name could not be found. +* `AssertionError`: if the name was found multiple time. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.graph` {#SubGraphView.graph} + +The underlying tf.Graph. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.input_index(t)` {#SubGraphView.input_index} + +Find the input index corresponding to the given input tensor t. + +##### Args: + + +* `t`: the input tensor of this subgraph view. + +##### Returns: + + the index in the self.inputs list. + +##### Raises: + + +* `Error`: if t in not an input tensor. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.inputs` {#SubGraphView.inputs} + +The input tensors of this subgraph view. 
+ + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.is_passthrough(t)` {#SubGraphView.is_passthrough} + +Check whether a tensor is passthrough. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.op(op_id)` {#SubGraphView.op} + +Get an op by its index. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.ops` {#SubGraphView.ops} + +The operations in this subgraph view. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.output_index(t)` {#SubGraphView.output_index} + +Find the output index corresponding to given output tensor t. + +##### Args: + + +* `t`: the output tensor of this subgraph view. + +##### Returns: + + the index in the self.outputs list. + +##### Raises: + + +* `Error`: if t in not an output tensor. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.outputs` {#SubGraphView.outputs} + +The output tensors of this subgraph view. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.passthroughs` {#SubGraphView.passthroughs} + +The passthrough tensors, going straight from input to output. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remap(new_input_indices=None, new_output_indices=None)` {#SubGraphView.remap} + +Remap the inputs and outputs of the subgraph. + +Note that this is only modifying the view: the underlying tf.Graph is not +affected. + +##### Args: + + +* `new_input_indices`: an iterable of integers representing a mapping between + the old inputs and the new ones. This mapping can be under-complete and + must be without repetitions. +* `new_output_indices`: an iterable of integers representing a mapping between + the old outputs and the new ones. This mapping can be under-complete and + can have repetitions. + +##### Returns: + + A new modified instance of the original subgraph view with remapped + inputs and outputs. 
+ + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remap_default(remove_input_map=True, remove_output_map=True)` {#SubGraphView.remap_default} + +Remap the inputs and/or outputs to the default mapping. + +##### Args: + + +* `remove_input_map`: if True the input map is reset to the default one. +* `remove_output_map`: if True the output map is reset to the default one. + +##### Returns: + + A new modified instance of the original subgraph view with its + input and/or output mapping reset to the default one. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remap_inputs(new_input_indices)` {#SubGraphView.remap_inputs} + +Remap the inputs of the subgraph. + +If the inputs of the original subgraph are [t0, t1, t2], remapping to [2,0] +will create a new instance whose inputs is [t2, t0]. + +Note that this is only modifying the view: the underlying tf.Graph is not +affected. + +##### Args: + + +* `new_input_indices`: an iterable of integers representing a mapping between + the old inputs and the new ones. This mapping can be under-complete and + must be without repetitions. + +##### Returns: + + A new modified instance of the original subgraph view with remapped + inputs. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remap_outputs(new_output_indices)` {#SubGraphView.remap_outputs} + +Remap the output of the subgraph. + +If the output of the original subgraph are [t0, t1, t2], remapping to +[1,1,0] will create a new instance whose outputs is [t1, t1, t0]. + +Note that this is only modifying the view: the underlying tf.Graph is not +affected. + +##### Args: + + +* `new_output_indices`: an iterable of integers representing a mapping between + the old outputs and the new ones. This mapping can be under-complete and + can have repetitions. + +##### Returns: + + A new modified instance of the original subgraph view with remapped + outputs. 
+ + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remap_outputs_make_unique()` {#SubGraphView.remap_outputs_make_unique} + +Remap the outputs so that all the tensors appears only once. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remap_outputs_to_consumers()` {#SubGraphView.remap_outputs_to_consumers} + +Remap the outputs to match the number of consumers. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remove_unused_ops(control_inputs=True)` {#SubGraphView.remove_unused_ops} + +Remove unused ops. + +##### Args: + + +* `control_inputs`: if True, control inputs are used to detect used ops. + +##### Returns: + + A new subgraph view which only contains used operations. + + + +- - - + +### `class tf.contrib.graph_editor.Transformer` {#Transformer} + +Transform a subgraph into another one. + +By default, the constructor create a transform which copy a subgraph and +replaces inputs with placeholders. This behavior can be modified by changing +the handlers. +- - - + +#### `tf.contrib.graph_editor.Transformer.__init__()` {#Transformer.__init__} + +Transformer constructor. + +The following members can be modified: +transform_op_handler: handle the transformation of a tf.Operation. + This handler defaults to a simple copy. +assign_collections_handler: handle the assignment of collections. + This handler defaults to assigning new collections created under the + given name-scope. +transform_input_handler: handle the transform of the inputs to the given + subgraph. This handler defaults to creating placeholders instead of the + ops just before the input tensors of the subgraph. +transform_hidden_input_handler: handle the transform of the hidden inputs of + the subgraph, that is, the inputs which are not listed in sgv.inputs. + This handler defaults to a transform which keep the same input if the + source and destination graphs are the same, otherwise use placeholders. +transform_original_op_hanlder: handle the transform of original_op. 
This + handler defaults to transforming original_op only if they are in the + subgraph, otherwise they are ignored. + + +- - - + +#### `tf.contrib.graph_editor.Transformer.new_name(name)` {#Transformer.new_name} + +Compute a destination name from a source name. + +##### Args: + + +* `name`: the name to be "transformed". + +##### Returns: + + the transformed name. + +##### Raises: + + +* `ValueError`: if the source scope is used (that is, not an empty string) + and the source name does not belong to the source scope. + + + +- - - + +### `tf.contrib.graph_editor.bypass(sgv)` {#bypass} + +Bypass the given subgraph by connecting its inputs to its outputs. + +##### Args: + + +* `sgv`: the subgraph view to be bypassed. This argument is converted to a + subgraph using the same rules than the function subgraph.make_view. + +##### Returns: + + A new subgraph view of the bypassed subgraph. + Note that sgv is also modified in place. + +##### Raises: + + +* `StandardError`: if sgv cannot be converted to a SubGraphView using + the same rules than the function subgraph.make_view. + + +- - - + +### `tf.contrib.graph_editor.connect(sgv0, sgv1, disconnect_first=False)` {#connect} + +Connect the outputs of sgv0 to the inputs of sgv1. + +##### Args: + + +* `sgv0`: the first subgraph to have its outputs swapped. This argument is + converted to a subgraph using the same rules as the function + subgraph.make_view. +* `sgv1`: the second subgraph to have its outputs swapped. This argument is + converted to a subgraph using the same rules as the function + subgraph.make_view. +* `disconnect_first`: if True the current outputs of sgv0 are disconnected. + +##### Returns: + + Two new subgraph views (now connected). sgv0 and svg1 are also modified + in place. + +##### Raises: + + +* `StandardError`: if sgv0 or sgv1 cannot be converted to a SubGraphView using + the same rules than the function subgraph.make_view. 
+ + +- - - + +### `tf.contrib.graph_editor.detach(sgv, control_inputs=False, control_outputs=None, control_ios=None)` {#detach} + +Detach both the inputs and the outputs of a subgraph view. + +##### Args: + + +* `sgv`: the subgraph view to be detached. This argument is converted to a + subgraph using the same rules as the function subgraph.make_view. +* `control_inputs`: A boolean indicating whether control inputs are enabled. +* `control_outputs`: An instance of util.ControlOutputs or None. If not None, + control outputs are enabled. +* `control_ios`: An instance of util.ControlOutputs or None. If not None, both + control inputs and control outputs are enabled. This is equivalent to set + control_inputs to True and control_outputs to the util.ControlOutputs + instance. + +##### Returns: + + A new subgraph view of the detached subgraph. + Note that sgv is also modified in place. + +##### Raises: + + +* `StandardError`: if sgv cannot be converted to a SubGraphView using + the same rules than the function subgraph.make_view. + + +- - - + +### `tf.contrib.graph_editor.detach_inputs(sgv, control_inputs=False)` {#detach_inputs} + +Detach the inputs of a subgraph view. + +##### Args: + + +* `sgv`: the subgraph view to be detached. This argument is converted to a + subgraph using the same rules as the function subgraph.make_view. +* `control_inputs`: if True control_inputs are also detached. + +##### Returns: + + A new subgraph view of the detached subgraph. + Note that sgv is also modified in place. + +##### Raises: + + +* `StandardError`: if sgv cannot be converted to a SubGraphView using + the same rules than the function subgraph.make_view. + + +- - - + +### `tf.contrib.graph_editor.detach_outputs(sgv, control_outputs=None)` {#detach_outputs} + +Detach the outputa of a subgraph view. + +##### Args: + + +* `sgv`: the subgraph view to be detached. This argument is converted to a + subgraph using the same rules as the function subgraph.make_view. 
+* `control_outputs`: a util.ControlOutputs instance or None. If not None the + control outputs are also detached. + +##### Returns: + + A new subgraph view of the detached subgraph. + Note that sgv is also modified in place. + +##### Raises: + + +* `StandardError`: if sgv cannot be converted to a SubGraphView using + the same rules than the function subgraph.make_view. + + +- - - + +### `class tf.contrib.graph_editor.matcher` {#matcher} + +Graph match class. +- - - + +#### `tf.contrib.graph_editor.matcher.__init__(positive_filter)` {#matcher.__init__} + +Graph match constructor. + + +- - - + +#### `tf.contrib.graph_editor.matcher.control_input_ops(*args)` {#matcher.control_input_ops} + +Add input matches. + + +- - - + +#### `tf.contrib.graph_editor.matcher.input_ops(*args)` {#matcher.input_ops} + +Add input matches. + + +- - - + +#### `tf.contrib.graph_editor.matcher.output_ops(*args)` {#matcher.output_ops} + +Add output matches. + + + +- - - + +### `tf.contrib.graph_editor.ph(dtype, shape=None, scope=None)` {#ph} + +Create a tf.placeholder for the Graph Editor. + +Note that the correct graph scope must be set by the calling function. +The placeholder is named using the function placeholder_name (with no +tensor argument). + +##### Args: + + +* `dtype`: the tensor type. +* `shape`: the tensor shape (optional). +* `scope`: absolute scope within which to create the placeholder. None + means that the scope of t is preserved. "" means the root scope. + +##### Returns: + + A newly created tf.placeholder. + + +- - - + +### `tf.contrib.graph_editor.reroute_a2b(sgv0, sgv1)` {#reroute_a2b} + +Re-route the inputs and outputs of sgv0 to sgv1 (see _reroute). + + +- - - + +### `tf.contrib.graph_editor.reroute_a2b_inputs(sgv0, sgv1)` {#reroute_a2b_inputs} + +Re-route all the inputs of sgv0 to sgv1 (see reroute_inputs). 
+ + +- - - + +### `tf.contrib.graph_editor.reroute_a2b_outputs(sgv0, sgv1)` {#reroute_a2b_outputs} + +Re-route all the outputs of sgv0 to sgv1 (see _reroute_outputs). + + +- - - + +### `tf.contrib.graph_editor.reroute_b2a(sgv0, sgv1)` {#reroute_b2a} + +Re-route the inputs and outputs of sgv1 to sgv0 (see _reroute). + + +- - - + +### `tf.contrib.graph_editor.reroute_b2a_inputs(sgv0, sgv1)` {#reroute_b2a_inputs} + +Re-route all the inputs of sgv1 to sgv0 (see reroute_inputs). + + +- - - + +### `tf.contrib.graph_editor.reroute_b2a_outputs(sgv0, sgv1)` {#reroute_b2a_outputs} + +Re-route all the outputs of sgv1 to sgv0 (see _reroute_outputs). + + +- - - + +### `tf.contrib.graph_editor.select_ops(*args, **kwargs)` {#select_ops} + +Helper to select operations. + +##### Args: + + +* `*args`: list of 1) regular expressions (compiled or not) or 2) (array of) + tf.Operation. tf.Tensor instances are silently ignored. +* `**kwargs`: 'graph': tf.Graph in which to perform the regex query. This is + required when using regex. + 'positive_filter': an elem is selected only if positive_filter(elem) is + True. This is optional. + 'restrict_ops_regex': a regular expression is ignored if it doesn't start + with the substring "(?#ops)". + +##### Returns: + + list of tf.Operation + +##### Raises: + + +* `TypeError`: if the optional keyword argument graph is not a tf.Graph + or if an argument in args is not an (array of) tf.Operation + or an (array of) tf.Tensor (silently ignored) or a string + or a regular expression. +* `ValueError`: if one of the keyword arguments is unexpected or if a regular + expression is used without passing a graph as a keyword argument. + + +- - - + +### `tf.contrib.graph_editor.select_ts(*args, **kwargs)` {#select_ts} + +Helper to select tensors. + +##### Args: + + +* `*args`: list of 1) regular expressions (compiled or not) or 2) (array of) + tf.Tensor. tf.Operation instances are silently ignored. 
+* `**kwargs`: 'graph': tf.Graph in which to perform the regex query. This is + required when using regex. + 'positive_filter': an elem is selected only if positive_filter(elem) is + True. This is optional. + 'restrict_ts_regex': a regular expression is ignored if it doesn't start + with the substring "(?#ts)". + +##### Returns: + + list of tf.Tensor + +##### Raises: + + +* `TypeError`: if the optional keyword argument graph is not a tf.Graph + or if an argument in args is not an (array of) tf.Tensor + or an (array of) tf.Operation (silently ignored) or a string + or a regular expression. +* `ValueError`: if one of the keyword arguments is unexpected or if a regular + expression is used without passing a graph as a keyword argument. + + +- - - + +### `tf.contrib.graph_editor.sgv(*args, **kwargs)` {#sgv} + +Create a SubGraphView from selected operations and passthrough tensors. + +##### Args: + + +* `*args`: list of 1) regular expressions (compiled or not) or 2) (array of) + tf.Operation 3) (array of) tf.Tensor. Those objects will be converted + into a list of operations and a list of candidates for passthrough tensors. +* `**kwargs`: keyword graph is used 1) to check that the ops and ts are from + the correct graph 2) for regular expression query + +##### Returns: + + A subgraph view. + +##### Raises: + + +* `TypeError`: if the optional keyword argument graph is not a tf.Graph + or if an argument in args is not an (array of) tf.Tensor + or an (array of) tf.Operation or a string or a regular expression. +* `ValueError`: if one of the keyword arguments is unexpected. + + +- - - + +### `tf.contrib.graph_editor.sgv_scope(scope, graph)` {#sgv_scope} + +Make a subgraph from a name scope. + +##### Args: + + +* `scope`: the name of the scope. +* `graph`: the tf.Graph. + +##### Returns: + + A subgraph view representing the given scope. + + +- - - + +### `tf.contrib.graph_editor.swap(sgv0, sgv1)` {#swap} + +Swap the inputs and outputs of sgv1 to sgv0 (see _reroute). 
+ + +- - - + +### `tf.contrib.graph_editor.swap_inputs(sgv0, sgv1)` {#swap_inputs} + +Swap all the inputs of sgv0 and sgv1 (see reroute_inputs). + + +- - - + +### `tf.contrib.graph_editor.swap_outputs(sgv0, sgv1)` {#swap_outputs} + +Swap all the outputs of sgv0 and sgv1 (see _reroute_outputs). + + +- - - + +### `tf.contrib.graph_editor.ts(*args, **kwargs)` {#ts} + +Helper to select tensors. + +##### Args: + + +* `*args`: list of 1) regular expressions (compiled or not) or 2) (array of) + tf.Tensor. tf.Operation instances are silently ignored. +* `**kwargs`: 'graph': tf.Graph in which to perform the regex query. This is + required when using regex. + 'positive_filter': an elem is selected only if positive_filter(elem) is + True. This is optional. + 'restrict_ts_regex': a regular expression is ignored if it doesn't start + with the substring "(?#ts)". + +##### Returns: + + list of tf.Tensor + +##### Raises: + + +* `TypeError`: if the optional keyword argument graph is not a tf.Graph + or if an argument in args is not an (array of) tf.Tensor + or an (array of) tf.Operation (silently ignored) or a string + or a regular expression. +* `ValueError`: if one of the keyword arguments is unexpected or if a regular + expression is used without passing a graph as a keyword argument. + + diff --git a/tensorflow/g3doc/api_docs/python/contrib.layers.md b/tensorflow/g3doc/api_docs/python/contrib.layers.md index 914eb0f581f..beb2ad8ea23 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.layers.md +++ b/tensorflow/g3doc/api_docs/python/contrib.layers.md @@ -15,28 +15,27 @@ common machine learning algorithms. ### `tf.contrib.layers.avg_pool2d(*args, **kwargs)` {#avg_pool2d} -Adds a Avg Pooling op. +Adds a 2D average pooling op. -It is assumed by the wrapper that the pooling is only done per image and not -in depth or batch. +It is assumed that the pooling is done per image but not in batch or channels. ##### Args: -* `inputs`: a tensor of size [batch_size, height, width, depth]. 
-* `kernel_size`: a list of length 2: [kernel_height, kernel_width] of the +* `inputs`: A `Tensor` of size [batch_size, height, width, channels]. +* `kernel_size`: A list of length 2: [kernel_height, kernel_width] of the pooling kernel over which the op is computed. Can be an int if both values are the same. -* `stride`: a list of length 2: [stride_height, stride_width]. - Can be an int if both strides are the same. Note that presently +* `stride`: A list of length 2: [stride_height, stride_width]. + Can be an int if both strides are the same. Note that presently both strides must have the same value. -* `padding`: the padding method, either 'VALID' or 'SAME'. -* `outputs_collections`: collection to add the outputs. +* `padding`: The padding method, either 'VALID' or 'SAME'. +* `outputs_collections`: The collections to which the outputs are added. * `scope`: Optional scope for op_scope. ##### Returns: - a tensor representing the results of the pooling operation. + A `Tensor` representing the results of the pooling operation. - - - @@ -327,33 +326,32 @@ prior to the initial matrix multiply by `weights`. ### `tf.contrib.layers.max_pool2d(*args, **kwargs)` {#max_pool2d} -Adds a Max Pooling op. +Adds a 2D Max Pooling op. -It is assumed by the wrapper that the pooling is only done per image and not -in depth or batch. +It is assumed that the pooling is done per image but not in batch or channels. ##### Args: -* `inputs`: a tensor of size [batch_size, height, width, depth]. -* `kernel_size`: a list of length 2: [kernel_height, kernel_width] of the +* `inputs`: A `Tensor` of size [batch_size, height, width, channels]. +* `kernel_size`: A list of length 2: [kernel_height, kernel_width] of the pooling kernel over which the op is computed. Can be an int if both values are the same. -* `stride`: a list of length 2: [stride_height, stride_width]. - Can be an int if both strides are the same. Note that presently +* `stride`: A list of length 2: [stride_height, stride_width]. 
+ Can be an int if both strides are the same. Note that presently both strides must have the same value. -* `padding`: the padding method, either 'VALID' or 'SAME'. -* `outputs_collections`: collection to add the outputs. +* `padding`: The padding method, either 'VALID' or 'SAME'. +* `outputs_collections`: The collections to which the outputs are added. * `scope`: Optional scope for op_scope. ##### Returns: - a tensor representing the results of the pooling operation. + A `Tensor` representing the results of the pooling operation. ##### Raises: -* `ValueError`: if 'kernel_size' is not a 2-D list +* `ValueError`: If 'kernel_size' is not a 2-D list - - - @@ -767,7 +765,7 @@ Optimize weights given a loss. - - - -### `tf.contrib.layers.optimize_loss(loss, global_step, learning_rate, optimizer, gradient_noise_scale=None, gradient_multipliers=None, clip_gradients=None, moving_average_decay=0.9, learning_rate_decay_fn=None, update_ops=None, variables=None, name=None)` {#optimize_loss} +### `tf.contrib.layers.optimize_loss(loss, global_step, learning_rate, optimizer, gradient_noise_scale=None, gradient_multipliers=None, clip_gradients=None, moving_average_decay=None, learning_rate_decay_fn=None, update_ops=None, variables=None, name=None, summaries=None)` {#optimize_loss} Given loss and parameters for optimizer, returns a training op. @@ -790,8 +788,8 @@ Given loss and parameters for optimizer, returns a training op. If present, gradients for specified variables will be multiplied by given constant. * `clip_gradients`: float or `None`, clips gradients by this value. -* `moving_average_decay`: float or None, takes into account previous loss - to make learning smoother due to outliers. +* `moving_average_decay`: Deprecated. float or None, takes into account previous + loss to make learning smoother due to outliers. * `learning_rate_decay_fn`: function, takes `learning_rate` and `global_step` `Tensor`s, returns `Tensor`. 
Can be used to implement any learning rate decay @@ -802,6 +800,9 @@ Given loss and parameters for optimizer, returns a training op. * `variables`: list of variables to optimize or `None` to use all trainable variables. * `name`: The name for this operation is used to scope operations and summaries. +* `summaries`: List of internal quantities to visualize on tensorboard. If not + set only the loss and the learning rate will be reported. The + complete list is in OPTIMIZER_SUMMARIES. ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/contrib.learn.md b/tensorflow/g3doc/api_docs/python/contrib.learn.md index 6faeaee9dbb..a7214b2242a 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.learn.md +++ b/tensorflow/g3doc/api_docs/python/contrib.learn.md @@ -31,9 +31,9 @@ Initializes a BaseEstimator instance. ##### Args: -* `model_dir`: Directory to save model parameters, graph and etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. +* `model_dir`: Directory to save model parameters, graph and etc. This can + also be used to load checkpoints from the directory into a estimator to + continue training a previously saved model. * `config`: A RunConfig instance. @@ -41,56 +41,7 @@ Initializes a BaseEstimator instance. #### `tf.contrib.learn.BaseEstimator.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#BaseEstimator.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. 
- -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. 
##### Raises: @@ -104,37 +55,7 @@ for which this evaluation was performed. #### `tf.contrib.learn.BaseEstimator.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#BaseEstimator.fit} -Trains a model given training data `x` predictions and `y` targets. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `steps`: Number of steps for which to train model. If `None`, train forever. - If set, `max_steps` must be `None`. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. -* `max_steps`: Number of total steps for which to train model. If `None`, - train forever. If set, `steps` must be `None`. - - Two calls to `fit(steps=100)` means 200 training - iterations. On the other hand, two calls to `fit(max_steps=100)` means - that the second call will not do any iteration since first call did - all 100 steps. - -##### Returns: - - `self`, for chaining. +See `Trainable`. ##### Raises: @@ -340,9 +261,9 @@ Constructs an Estimator instance. to configure Estimators from hyper parameter tunning. -* `model_dir`: Directory to save model parameters, graph and etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. +* `model_dir`: Directory to save model parameters, graph and etc. 
This can + also be used to load checkpoints from the directory into a estimator to + continue training a previously saved model. * `config`: Configuration object. * `params`: `dict` of hyper parameters that will be passed into `model_fn`. Keys are names of parameters, values are basic python types. @@ -357,56 +278,7 @@ Constructs an Estimator instance. #### `tf.contrib.learn.Estimator.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#Estimator.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. 
If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: @@ -420,37 +292,7 @@ for which this evaluation was performed. #### `tf.contrib.learn.Estimator.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#Estimator.fit} -Trains a model given training data `x` predictions and `y` targets. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `steps`: Number of steps for which to train model. If `None`, train forever. - If set, `max_steps` must be `None`. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. 
Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. -* `max_steps`: Number of total steps for which to train model. If `None`, - train forever. If set, `steps` must be `None`. - - Two calls to `fit(steps=100)` means 200 training - iterations. On the other hand, two calls to `fit(max_steps=100)` means - that the second call will not do any iteration since first call did - all 100 steps. - -##### Returns: - - `self`, for chaining. +See `Trainable`. ##### Raises: @@ -667,56 +509,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.TensorFlowClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowClassifier.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. 
-* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: @@ -1030,56 +823,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.DNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#DNNClassifier.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. 
- -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. 
##### Raises: @@ -1093,37 +837,7 @@ for which this evaluation was performed. #### `tf.contrib.learn.DNNClassifier.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#DNNClassifier.fit} -Trains a model given training data `x` predictions and `y` targets. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `steps`: Number of steps for which to train model. If `None`, train forever. - If set, `max_steps` must be `None`. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. -* `max_steps`: Number of total steps for which to train model. If `None`, - train forever. If set, `steps` must be `None`. - - Two calls to `fit(steps=100)` means 200 training - iterations. On the other hand, two calls to `fit(max_steps=100)` means - that the second call will not do any iteration since first call did - all 100 steps. - -##### Returns: - - `self`, for chaining. +See `Trainable`. ##### Raises: @@ -1447,56 +1161,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.DNNRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#DNNRegressor.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. 
If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. 
See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: @@ -1510,37 +1175,7 @@ for which this evaluation was performed. #### `tf.contrib.learn.DNNRegressor.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#DNNRegressor.fit} -Trains a model given training data `x` predictions and `y` targets. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `steps`: Number of steps for which to train model. If `None`, train forever. - If set, `max_steps` must be `None`. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. -* `max_steps`: Number of total steps for which to train model. If `None`, - train forever. If set, `steps` must be `None`. - - Two calls to `fit(steps=100)` means 200 training - iterations. On the other hand, two calls to `fit(max_steps=100)` means - that the second call will not do any iteration since first call did - all 100 steps. - -##### Returns: - - `self`, for chaining. +See `Trainable`. ##### Raises: @@ -1766,56 +1401,7 @@ Returns weights of deep neural network part. 
#### `tf.contrib.learn.TensorFlowDNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowDNNClassifier.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. 
- `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: @@ -2040,56 +1626,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.TensorFlowDNNRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowDNNRegressor.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. 
-* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: @@ -2655,9 +2192,9 @@ Construct a `LinearClassifier` estimator object. * `feature_columns`: An iterable containing all the feature columns used by the model. All items in the set should be instances of classes derived from `FeatureColumn`. -* `model_dir`: Directory to save model parameters, graph and etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. +* `model_dir`: Directory to save model parameters, graph and etc. This can + also be used to load checkpoints from the directory into a estimator + to continue training a previously saved model. 
* `n_classes`: number of target classes. Default is binary classification. * `weight_column_name`: A string defining feature column name representing weights. It is used to down weight or boost examples during training. It @@ -2703,56 +2240,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.LinearClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LinearClassifier.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. 
If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: @@ -2766,37 +2254,7 @@ for which this evaluation was performed. #### `tf.contrib.learn.LinearClassifier.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#LinearClassifier.fit} -Trains a model given training data `x` predictions and `y` targets. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `steps`: Number of steps for which to train model. If `None`, train forever. - If set, `max_steps` must be `None`. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. 
Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. -* `max_steps`: Number of total steps for which to train model. If `None`, - train forever. If set, `steps` must be `None`. - - Two calls to `fit(steps=100)` means 200 training - iterations. On the other hand, two calls to `fit(max_steps=100)` means - that the second call will not do any iteration since first call did - all 100 steps. - -##### Returns: - - `self`, for chaining. +See `Trainable`. ##### Raises: @@ -3056,9 +2514,9 @@ Construct a `LinearRegressor` estimator object. * `feature_columns`: An iterable containing all the feature columns used by the model. All items in the set should be instances of classes derived from `FeatureColumn`. -* `model_dir`: Directory to save model parameters, graph, etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. +* `model_dir`: Directory to save model parameters, graph, etc. This can + also be used to load checkpoints from the directory into a estimator + to continue training a previously saved model. * `weight_column_name`: A string defining feature column name representing weights. It is used to down weight or boost examples during training. It will be multiplied by the loss of the example. @@ -3103,56 +2561,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.LinearRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LinearRegressor.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. 
If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. 
- -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: @@ -3166,37 +2575,7 @@ for which this evaluation was performed. #### `tf.contrib.learn.LinearRegressor.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#LinearRegressor.fit} -Trains a model given training data `x` predictions and `y` targets. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `steps`: Number of steps for which to train model. If `None`, train forever. - If set, `max_steps` must be `None`. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. -* `max_steps`: Number of total steps for which to train model. If `None`, - train forever. If set, `steps` must be `None`. - - Two calls to `fit(steps=100)` means 200 training - iterations. On the other hand, two calls to `fit(max_steps=100)` means - that the second call will not do any iteration since first call did - all 100 steps. - -##### Returns: - - `self`, for chaining. +See `Trainable`. ##### Raises: @@ -3422,56 +2801,7 @@ Returns weights of deep neural network part. 
#### `tf.contrib.learn.TensorFlowLinearClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowLinearClassifier.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. 
- `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: @@ -3696,56 +3026,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.TensorFlowLinearRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowLinearRegressor.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. 
-* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: @@ -4615,56 +3896,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.TensorFlowRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowRegressor.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. 
- -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. 
##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md b/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md index 56b7879acff..04e0ba140e8 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md +++ b/tensorflow/g3doc/api_docs/python/contrib.learn.monitors.md @@ -469,38 +469,32 @@ Returns the values captured so far. Saves checkpoints every N steps. - - - -#### `tf.contrib.learn.monitors.CheckpointSaver.__init__(every_n_steps, saver, checkpoint_dir, checkpoint_basename='model.ckpt', first_n_steps=-1)` {#CheckpointSaver.__init__} +#### `tf.contrib.learn.monitors.CheckpointSaver.__init__(checkpoint_dir, save_secs=None, save_steps=None, saver=None, checkpoint_basename='model.ckpt', scaffold=None)` {#CheckpointSaver.__init__} Initialize CheckpointSaver monitor. ##### Args: -* `every_n_steps`: `int`, save every N steps. -* `saver`: `Saver` object, used for saving. * `checkpoint_dir`: `str`, base directory for the checkpoint files. +* `save_secs`: `int`, save every N secs. +* `save_steps`: `int`, save every N steps. +* `saver`: `Saver` object, used for saving. * `checkpoint_basename`: `str`, base name for the checkpoint files. -* `first_n_steps`: `int`, if positive, save every step during the - first `first_n_steps` steps. +* `scaffold`: `Scaffold`, use to get saver object. + +##### Raises: + + +* `ValueError`: If both `save_steps` and `save_secs` are not `None`. +* `ValueError`: If both `save_steps` and `save_secs` are `None`. - - - #### `tf.contrib.learn.monitors.CheckpointSaver.begin(max_steps=None)` {#CheckpointSaver.begin} -Called at the beginning of training. -When called, the default graph is the one we are executing. - -##### Args: - - -* `max_steps`: `int`, the maximum global step this training will run until. - -##### Raises: - - -* `ValueError`: if we've already begun a run. - - - @@ -544,55 +538,6 @@ End epoch. * `ValueError`: if we've not begun an epoch, or `epoch` number does not match. 
-- - - - -#### `tf.contrib.learn.monitors.CheckpointSaver.every_n_post_step(step, session)` {#CheckpointSaver.every_n_post_step} - - - - -- - - - -#### `tf.contrib.learn.monitors.CheckpointSaver.every_n_step_begin(step)` {#CheckpointSaver.every_n_step_begin} - -Callback before every n'th step begins. - -##### Args: - - -* `step`: `int`, the current value of the global step. - -##### Returns: - - A `list` of tensors that will be evaluated at this step. - - -- - - - -#### `tf.contrib.learn.monitors.CheckpointSaver.every_n_step_end(step, outputs)` {#CheckpointSaver.every_n_step_end} - -Callback after every n'th step finished. - -This callback provides access to the tensors/ops evaluated at this step, -including the additional tensors for which evaluation was requested in -`step_begin`. - -In addition, the callback has the opportunity to stop training by returning -`True`. This is useful for early stopping, for example. - -##### Args: - - -* `step`: `int`, the current value of the global step. -* `outputs`: `dict` mapping `string` values representing tensor names to - the value resulted from running these tensors. Values may be either - scalars, for scalar tensors, or Numpy `array`, for non-scalar tensors. - -##### Returns: - - `bool`. True if training should stop. - - - - - #### `tf.contrib.learn.monitors.CheckpointSaver.post_step(step, session)` {#CheckpointSaver.post_step} @@ -628,33 +573,24 @@ A setter called automatically by the target estimator. #### `tf.contrib.learn.monitors.CheckpointSaver.step_begin(step)` {#CheckpointSaver.step_begin} -Overrides `BaseMonitor.step_begin`. -When overriding this method, you must call the super implementation. - -##### Args: - - -* `step`: `int`, the current value of the global step. - -##### Returns: - - A `list`, the result of every_n_step_begin, if that was called this step, - or an empty list otherwise. - -##### Raises: - - -* `ValueError`: if called more than once during a step. 
- - - #### `tf.contrib.learn.monitors.CheckpointSaver.step_end(step, output)` {#CheckpointSaver.step_end} -Overrides `BaseMonitor.step_end`. +Callback after training step finished. -When overriding this method, you must call the super implementation. +This callback provides access to the tensors/ops evaluated at this step, +including the additional tensors for which evaluation was requested in +`step_begin`. + +In addition, the callback has the opportunity to stop training by returning +`True`. This is useful for early stopping, for example. + +Note that this method is not called if the call to `Session.run()` that +followed the last call to `step_begin()` failed. ##### Args: @@ -666,8 +602,12 @@ When overriding this method, you must call the super implementation. ##### Returns: - `bool`, the result of every_n_step_end, if that was called this step, - or `False` otherwise. + `bool`. True if training should stop. + +##### Raises: + + +* `ValueError`: if we've not begun a step, or `step` number does not match. @@ -2173,7 +2113,7 @@ A setter called automatically by the target estimator. Saves summaries every N steps. - - - -#### `tf.contrib.learn.monitors.SummarySaver.__init__(summary_op, save_steps=100, output_dir=None, summary_writer=None)` {#SummarySaver.__init__} +#### `tf.contrib.learn.monitors.SummarySaver.__init__(summary_op, save_steps=100, output_dir=None, summary_writer=None, scaffold=None)` {#SummarySaver.__init__} Initializes a `SummarySaver` monitor. @@ -2188,6 +2128,7 @@ Initializes a `SummarySaver` monitor. if no `summary_writer` is supplied. * `summary_writer`: `SummaryWriter`. If `None` and an `output_dir` was passed, one will be created accordingly. +* `scaffold`: `Scaffold` to get summary_op if it's not provided. 
- - - diff --git a/tensorflow/g3doc/api_docs/python/contrib.losses.md b/tensorflow/g3doc/api_docs/python/contrib.losses.md index 846718e196c..26d297b38f3 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.losses.md +++ b/tensorflow/g3doc/api_docs/python/contrib.losses.md @@ -140,6 +140,31 @@ Notice that the function adds the given losses to the regularization losses. * `ValueError`: if `losses` is not iterable. +- - - + +### `tf.contrib.losses.hinge_loss(logits, target, scope=None)` {#hinge_loss} + +Method that returns the loss tensor for hinge loss. + +##### Args: + + +* `logits`: The logits, a float tensor. +* `target`: The ground truth output tensor. Its shape should match the shape of + logits. The values of the tensor are expected to be 0.0 or 1.0. +* `scope`: The scope for the operations performed in computing the loss. + +##### Returns: + + A `Tensor` of same shape as logits and target representing the loss values + across the batch. + +##### Raises: + + +* `ValueError`: If the shapes of `logits` and `target` don't match. + + - - - ### `tf.contrib.losses.log_loss(predictions, targets, weight=1.0, epsilon=1e-07, scope=None)` {#log_loss} diff --git a/tensorflow/g3doc/api_docs/python/contrib.rnn.md b/tensorflow/g3doc/api_docs/python/contrib.rnn.md new file mode 100644 index 00000000000..34277d2b093 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/contrib.rnn.md @@ -0,0 +1,409 @@ + + +# RNN (contrib) +[TOC] + +Additional RNN operations and cells. + +## This package provides additional contributed RNNCells. + +### Fused RNNCells +- - - + +### `class tf.contrib.rnn.LSTMFusedCell` {#LSTMFusedCell} + +Basic LSTM recurrent network cell. + +The implementation is based on: http://arxiv.org/abs/1409.2329. + +We add forget_bias (default: 1) to the biases of the forget gate in order to +reduce the scale of forgetting in the beginning of the training. + +Unlike BasicLSTMCell, this is a monolithic op and should be much faster. 
The +weight and bias matrices should be compatible as long as the variable scope +matches. +- - - + +#### `tf.contrib.rnn.LSTMFusedCell.__init__(num_units, forget_bias=1.0, use_peephole=False)` {#LSTMFusedCell.__init__} + +Initialize the basic LSTM cell. + +##### Args: + + +* `num_units`: int, The number of units in the LSTM cell. +* `forget_bias`: float, The bias added to forget gates (see above). +* `use_peephole`: Whether to use peephole connections or not. + + +- - - + +#### `tf.contrib.rnn.LSTMFusedCell.output_size` {#LSTMFusedCell.output_size} + + + + +- - - + +#### `tf.contrib.rnn.LSTMFusedCell.state_size` {#LSTMFusedCell.state_size} + + + + +- - - + +#### `tf.contrib.rnn.LSTMFusedCell.zero_state(batch_size, dtype)` {#LSTMFusedCell.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + + +### LSTM-like cells +- - - + +### `class tf.contrib.rnn.CoupledInputForgetGateLSTMCell` {#CoupledInputForgetGateLSTMCell} + +Long short-term memory unit (LSTM) recurrent network cell. + +The default non-peephole implementation is based on: + + http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf + +S. Hochreiter and J. Schmidhuber. +"Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997. + +The peephole implementation is based on: + + https://research.google.com/pubs/archive/43905.pdf + +Hasim Sak, Andrew Senior, and Francoise Beaufays. +"Long short-term memory recurrent neural network architectures for + large scale acoustic modeling." INTERSPEECH, 2014. 
+ +The coupling of input and forget gate is based on: + + http://arxiv.org/pdf/1503.04069.pdf + +Greff et al. "LSTM: A Search Space Odyssey" + +The class uses optional peep-hole connections, and an optional projection +layer. +- - - + +#### `tf.contrib.rnn.CoupledInputForgetGateLSTMCell.__init__(num_units, use_peepholes=False, initializer=None, num_proj=None, proj_clip=None, num_unit_shards=1, num_proj_shards=1, forget_bias=1.0, state_is_tuple=False, activation=tanh)` {#CoupledInputForgetGateLSTMCell.__init__} + +Initialize the parameters for an LSTM cell. + +##### Args: + + +* `num_units`: int, The number of units in the LSTM cell +* `use_peepholes`: bool, set True to enable diagonal/peephole connections. +* `initializer`: (optional) The initializer to use for the weight and + projection matrices. +* `num_proj`: (optional) int, The output dimensionality for the projection + matrices. If None, no projection is performed. +* `proj_clip`: (optional) A float value. If `num_proj > 0` and `proj_clip` is + provided, then the projected values are clipped elementwise to within + `[-proj_clip, proj_clip]`. + +* `num_unit_shards`: How to split the weight matrix. If >1, the weight + matrix is stored across num_unit_shards. +* `num_proj_shards`: How to split the projection matrix. If >1, the + projection matrix is stored across num_proj_shards. +* `forget_bias`: Biases of the forget gate are initialized by default to 1 + in order to reduce the scale of forgetting at the beginning of + the training. +* `state_is_tuple`: If True, accepted and returned states are 2-tuples of + the `c_state` and `m_state`. By default (False), they are concatenated + along the column axis. This default behavior will soon be deprecated. +* `activation`: Activation function of the inner states. 
+ + +- - - + +#### `tf.contrib.rnn.CoupledInputForgetGateLSTMCell.output_size` {#CoupledInputForgetGateLSTMCell.output_size} + + + + +- - - + +#### `tf.contrib.rnn.CoupledInputForgetGateLSTMCell.state_size` {#CoupledInputForgetGateLSTMCell.state_size} + + + + +- - - + +#### `tf.contrib.rnn.CoupledInputForgetGateLSTMCell.zero_state(batch_size, dtype)` {#CoupledInputForgetGateLSTMCell.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + +- - - + +### `class tf.contrib.rnn.TimeFreqLSTMCell` {#TimeFreqLSTMCell} + +Time-Frequency Long short-term memory unit (LSTM) recurrent network cell. + +This implementation is based on: + + Tara N. Sainath and Bo Li + "Modeling Time-Frequency Patterns with LSTM vs. Convolutional Architectures + for LVCSR Tasks." submitted to INTERSPEECH, 2016. + +It uses peep-hole connections and optional cell clipping. +- - - + +#### `tf.contrib.rnn.TimeFreqLSTMCell.__init__(num_units, use_peepholes=False, cell_clip=None, initializer=None, num_unit_shards=1, forget_bias=1.0, feature_size=None, frequency_skip=None)` {#TimeFreqLSTMCell.__init__} + +Initialize the parameters for an LSTM cell. + +##### Args: + + +* `num_units`: int, The number of units in the LSTM cell +* `use_peepholes`: bool, set True to enable diagonal/peephole connections. +* `cell_clip`: (optional) A float value, if provided the cell state is clipped + by this value prior to the cell output activation. 
+* `initializer`: (optional) The initializer to use for the weight and + projection matrices. +* `num_unit_shards`: int, How to split the weight matrix. If >1, the weight + matrix is stored across num_unit_shards. +* `forget_bias`: float, Biases of the forget gate are initialized by default + to 1 in order to reduce the scale of forgetting at the beginning + of the training. +* `feature_size`: int, The size of the input feature the LSTM spans over. +* `frequency_skip`: int, The amount the LSTM filter is shifted by in + frequency. + + +- - - + +#### `tf.contrib.rnn.TimeFreqLSTMCell.output_size` {#TimeFreqLSTMCell.output_size} + + + + +- - - + +#### `tf.contrib.rnn.TimeFreqLSTMCell.state_size` {#TimeFreqLSTMCell.state_size} + + + + +- - - + +#### `tf.contrib.rnn.TimeFreqLSTMCell.zero_state(batch_size, dtype)` {#TimeFreqLSTMCell.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + +- - - + +### `class tf.contrib.rnn.GridLSTMCell` {#GridLSTMCell} + +Grid Long short-term memory unit (LSTM) recurrent network cell. + +The default is based on: + Nal Kalchbrenner, Ivo Danihelka and Alex Graves + "Grid Long Short-Term Memory," Proc. ICLR 2016. + http://arxiv.org/abs/1507.01526 + +When peephole connections are used, the implementation is based on: + Tara N. Sainath and Bo Li + "Modeling Time-Frequency Patterns with LSTM vs. Convolutional Architectures + for LVCSR Tasks." submitted to INTERSPEECH, 2016. 
+ +The code uses optional peephole connections, shared_weights and cell clipping. +- - - + +#### `tf.contrib.rnn.GridLSTMCell.__init__(num_units, use_peepholes=False, share_time_frequency_weights=False, cell_clip=None, initializer=None, num_unit_shards=1, forget_bias=1.0, feature_size=None, frequency_skip=None)` {#GridLSTMCell.__init__} + +Initialize the parameters for an LSTM cell. + +##### Args: + + +* `num_units`: int, The number of units in the LSTM cell +* `use_peepholes`: bool, default False. Set True to enable diagonal/peephole + connections. +* `share_time_frequency_weights`: bool, default False. Set True to enable + shared cell weights between time and frequency LSTMs. +* `cell_clip`: (optional) A float value, if provided the cell state is clipped + by this value prior to the cell output activation. +* `initializer`: (optional) The initializer to use for the weight and + projection matrices. +* `num_unit_shards`: int, How to split the weight matrix. If >1, the weight + matrix is stored across num_unit_shards. +* `forget_bias`: float, Biases of the forget gate are initialized by default + to 1 in order to reduce the scale of forgetting at the beginning + of the training. +* `feature_size`: int, The size of the input feature the LSTM spans over. +* `frequency_skip`: int, The amount the LSTM filter is shifted by in + frequency. + + +- - - + +#### `tf.contrib.rnn.GridLSTMCell.output_size` {#GridLSTMCell.output_size} + + + + +- - - + +#### `tf.contrib.rnn.GridLSTMCell.state_size` {#GridLSTMCell.state_size} + + + + +- - - + +#### `tf.contrib.rnn.GridLSTMCell.zero_state(batch_size, dtype)` {#GridLSTMCell.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. 
+ + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + + +### RNNCell wrappers +- - - + +### `class tf.contrib.rnn.AttentionCellWrapper` {#AttentionCellWrapper} + +Basic attention cell wrapper. + +Implementation based on https://arxiv.org/pdf/1601.06733.pdf. +- - - + +#### `tf.contrib.rnn.AttentionCellWrapper.__init__(cell, attn_length, attn_size=None, attn_vec_size=None, input_size=None, state_is_tuple=False)` {#AttentionCellWrapper.__init__} + +Create a cell with attention. + +##### Args: + + +* `cell`: an RNNCell, an attention is added to it. +* `attn_length`: integer, the size of an attention window. +* `attn_size`: integer, the size of an attention vector. Equal to + cell.output_size by default. +* `attn_vec_size`: integer, the number of convolutional features calculated + on attention state and a size of the hidden layer built from + base cell state. Equal to attn_size by default. +* `input_size`: integer, the size of a hidden linear layer, + built from inputs and attention. Derived from the input tensor + by default. +* `state_is_tuple`: If True, accepted and returned states are n-tuples, where + `n = len(cells)`. By default (False), the states are all + concatenated along the column axis. + +##### Raises: + + +* `TypeError`: if cell is not an RNNCell. +* `ValueError`: if cell returns a state tuple but the flag + `state_is_tuple` is `False` or if attn_length is zero or less. + + +- - - + +#### `tf.contrib.rnn.AttentionCellWrapper.output_size` {#AttentionCellWrapper.output_size} + + + + +- - - + +#### `tf.contrib.rnn.AttentionCellWrapper.state_size` {#AttentionCellWrapper.state_size} + + + + +- - - + +#### `tf.contrib.rnn.AttentionCellWrapper.zero_state(batch_size, dtype)` {#AttentionCellWrapper.zero_state} + +Return zero-filled state tensor(s). 
+ +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + + diff --git a/tensorflow/g3doc/api_docs/python/control_flow_ops.md b/tensorflow/g3doc/api_docs/python/control_flow_ops.md index 579633aa3b8..9a92c60b850 100644 --- a/tensorflow/g3doc/api_docs/python/control_flow_ops.md +++ b/tensorflow/g3doc/api_docs/python/control_flow_ops.md @@ -361,6 +361,9 @@ to your graph. Returns the truth value of x AND y element-wise. +*NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: @@ -396,6 +399,9 @@ Returns the truth value of NOT x element-wise. Returns the truth value of x OR y element-wise. +*NOTE*: `LogicalOr` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: @@ -427,6 +433,9 @@ operators to your graph. Returns the truth value of (x == y) element-wise. +*NOTE*: `Equal` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: @@ -445,6 +454,9 @@ Returns the truth value of (x == y) element-wise. Returns the truth value of (x != y) element-wise. +*NOTE*: `NotEqual` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: @@ -463,6 +475,9 @@ Returns the truth value of (x != y) element-wise. Returns the truth value of (x < y) element-wise. +*NOTE*: `Less` supports broadcasting. 
More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: @@ -481,6 +496,9 @@ Returns the truth value of (x < y) element-wise. Returns the truth value of (x <= y) element-wise. +*NOTE*: `LessEqual` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: @@ -499,6 +517,9 @@ Returns the truth value of (x <= y) element-wise. Returns the truth value of (x > y) element-wise. +*NOTE*: `Greater` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: @@ -517,6 +538,9 @@ Returns the truth value of (x > y) element-wise. Returns the truth value of (x >= y) element-wise. +*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/framework.md b/tensorflow/g3doc/api_docs/python/framework.md index 6108155a0ec..9bc9111fc39 100644 --- a/tensorflow/g3doc/api_docs/python/framework.md +++ b/tensorflow/g3doc/api_docs/python/framework.md @@ -1105,7 +1105,10 @@ DEPRECATED: Use outputs. ### `class tf.Tensor` {#Tensor} -Represents a value produced by an `Operation`. +Represents one of the outputs of an `Operation`. + +*Note:* the `Tensor` class will be replaced by `Output` in the future. +Currently these two are aliases for each other. A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. It does not hold the values of that operation's output, diff --git a/tensorflow/g3doc/api_docs/python/functional_ops.md b/tensorflow/g3doc/api_docs/python/functional_ops.md index 0de41334647..68366b1a83e 100644 --- a/tensorflow/g3doc/api_docs/python/functional_ops.md +++ b/tensorflow/g3doc/api_docs/python/functional_ops.md @@ -16,7 +16,7 @@ map-reduce programming patterns. 
- - - -### `tf.map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, swap_memory=False, name=None)` {#map_fn} +### `tf.map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name=None)` {#map_fn} map on the list of tensors unpacked from `elems` on dimension 0. @@ -58,6 +58,7 @@ nested) tuple of types matching the output of `fn`. in parallel. * `back_prop`: (optional) True enables support for back propagation. * `swap_memory`: (optional) True enables GPU-CPU memory swapping. +* `infer_shape`: (optional) False disables tests for consistent output shapes. * `name`: (optional) Name prefix for the returned tensors. ##### Returns: @@ -191,7 +192,7 @@ of the result tensor is `fn(initializer, values[0]).shape`. - - - -### `tf.scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, swap_memory=False, name=None)` {#scan} +### `tf.scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name=None)` {#scan} scan on the list of tensors unpacked from `elems` on dimension 0. @@ -243,6 +244,7 @@ For example, if `elems` is `(t1, [t2, t3])` and `initializer` is in parallel. * `back_prop`: (optional) True enables support for back propagation. * `swap_memory`: (optional) True enables GPU-CPU memory swapping. +* `infer_shape`: (optional) False disables tests for consistent output shapes. * `name`: (optional) Name prefix for the returned tensors. 
##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.VariableScope.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.VariableScope.md index 1b8931d726b..60254402eaa 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.VariableScope.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.VariableScope.md @@ -13,9 +13,10 @@ Attributes: partitioner: callable or `None`: the partitioner passed to `get_variable`. custom_getter: default custom getter passed to get_variable. name_scope: The name passed to `tf.name_scope`. + dtype: default type passed to get_variable (defaults to DT_FLOAT). - - - -#### `tf.VariableScope.__init__(reuse, name='', initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, name_scope='')` {#VariableScope.__init__} +#### `tf.VariableScope.__init__(reuse, name='', initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, name_scope='', dtype=tf.float32)` {#VariableScope.__init__} Creates a new VariableScope with the given properties. @@ -36,7 +37,14 @@ Creates a new VariableScope with the given properties. - - - -#### `tf.VariableScope.get_variable(var_store, name, shape=None, dtype=tf.float32, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#VariableScope.get_variable} +#### `tf.VariableScope.dtype` {#VariableScope.dtype} + + + + +- - - + +#### `tf.VariableScope.get_variable(var_store, name, shape=None, dtype=None, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#VariableScope.get_variable} Gets an existing variable with this name or create a new one. @@ -104,6 +112,13 @@ Set caching_device for this scope. Set custom getter for this scope. 
+- - - + +#### `tf.VariableScope.set_dtype(dtype)` {#VariableScope.set_dtype} + +Set data type for this scope. + + - - - #### `tf.VariableScope.set_initializer(initializer)` {#VariableScope.set_initializer} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.assert_non_positive.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.assert_non_positive.md index 83eb36a95cc..ded14160307 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.assert_non_positive.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.assert_non_positive.md @@ -1,4 +1,4 @@ -### `tf.assert_non_positive(x, data=None, summarize=None, name=None)` {#assert_non_positive} +### `tf.assert_non_positive(x, data=None, summarize=None, message=None, name=None)` {#assert_non_positive} Assert the condition `x <= 0` holds element-wise. @@ -25,6 +25,7 @@ If `x` is empty this is trivially satisfied. * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_non_positive". diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cholesky.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cholesky.md index 4032b80d8e0..61e781319d8 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cholesky.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cholesky.md @@ -1,6 +1,6 @@ ### `tf.cholesky(input, name=None)` {#cholesky} -Calculates the Cholesky decomposition of a square matrix. +Computes the Cholesky decomposition of a square matrix. The input has to be symmetric and positive definite. Only the lower-triangular part of the input will be used for this operation. 
The upper-triangular part diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Bernoulli.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Bernoulli.md index 1c16241d89a..79adadc72c2 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Bernoulli.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Bernoulli.md @@ -2,10 +2,6 @@ Bernoulli distribution. The Bernoulli distribution is parameterized by p, the probability of a positive event. - -Note, the following methods of the base class aren't implemented: - * cdf - * log_cdf - - - #### `tf.contrib.distributions.Bernoulli.__init__(logits=None, p=None, dtype=tf.int32, validate_args=True, allow_nan_stats=False, name='Bernoulli')` {#Bernoulli.__init__} @@ -25,10 +21,10 @@ Construct Bernoulli distributions. * `dtype`: dtype for samples. * `validate_args`: Whether to assert that `0 <= p <= 1`. If not validate_args, `log_pmf` may return nans. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: A name for this distribution. 
##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Dirichlet.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Dirichlet.md index 05da054e766..508fa43b59c 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Dirichlet.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.Dirichlet.md @@ -68,16 +68,16 @@ Initialize a batch of Dirichlet distributions. ##### Args: -* `alpha`: Positive `float` or `double` tensor with shape broadcastable to +* `alpha`: Positive floating point tensor with shape broadcastable to `[N1,..., Nm, k]` `m >= 0`. Defines this as a batch of `N1 x ... x Nm` different `k` class Dirichlet distributions. * `validate_args`: Whether to assert valid values for parameters `alpha` and - `x` in `prob` and `log_prob`. If False, correct behavior is not + `x` in `prob` and `log_prob`. If `False`, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prefix Ops created by this distribution class. @@ -233,7 +233,7 @@ Log of the probability mass function. ##### Args: -* `x`: Non-negative `float` or `double`, tensor whose shape can +* `x`: Non-negative tensor with dtype `dtype` and whose shape can be broadcast with `self.alpha`. 
For fixed leading dimensions, the last dimension represents counts for the corresponding Dirichlet distribution in `self.alpha`. `x` is only legal if it sums up to one. @@ -302,7 +302,7 @@ The probability mass function. ##### Args: -* `x`: Non-negative `float`, `double` tensor whose shape can +* `x`: Non-negative tensor with dtype `dtype` and whose shape can be broadcast with `self.alpha`. For fixed leading dimensions, the last dimension represents x for the corresponding Dirichlet distribution in `self.alpha` and `self.beta`. `x` is only legal if it sums up to one. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.MultivariateNormalCholesky.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.MultivariateNormalCholesky.md index a249fed523d..82e42910610 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.MultivariateNormalCholesky.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.distributions.MultivariateNormalCholesky.md @@ -6,14 +6,14 @@ and requires `O(k^2)` storage. #### Mathematical details -The PDF of this distribution is: +The Cholesky factor `chol` defines the covariance matrix: `C = chol chol^T`. + +The PDF of this distribution is then: ``` -f(x) = (2*pi)^(-k/2) |det(sigma)|^(-1/2) exp(-1/2*(x-mu)^*.sigma^{-1}.(x-mu)) +f(x) = (2 pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 (x - mu)^T C^{-1} (x - mu)) ``` -where `.` denotes the inner product on `R^k` and `^*` denotes transpose. - #### Examples A single multi-variate Gaussian distribution is defined by a vector of means @@ -51,22 +51,23 @@ Trainable (batch) Choesky matrices can be created with Multivariate Normal distributions on `R^k`. User must provide means `mu` and `chol` which holds the (batch) Cholesky -factors `S`, such that the covariance of each batch member is `S S^*`. 
+factors, such that the covariance of each batch member is `chol chol^T`. ##### Args: -* `mu`: `(N+1)-D` `float` or `double` tensor with shape `[N1,...,Nb, k]`, +* `mu`: `(N+1)-D` floating point tensor with shape `[N1,...,Nb, k]`, `b >= 0`. * `chol`: `(N+2)-D` `Tensor` with same `dtype` as `mu` and shape - `[N1,...,Nb, k, k]`. + `[N1,...,Nb, k, k]`. The upper triangular part is ignored (treated as + though it is zero), and the diagonal must be positive. * `validate_args`: Whether to validate input with asserts. If `validate_args` - is `False`, - and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + is `False`, and the inputs are invalid, correct behavior is not + guaranteed. +* `allow_nan_stats`: `Boolean`, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to give Ops created by the initializer. ##### Raises: @@ -79,7 +80,7 @@ factors `S`, such that the covariance of each batch member is `S S^*`. #### `tf.contrib.distributions.MultivariateNormalCholesky.allow_nan_stats` {#MultivariateNormalCholesky.allow_nan_stats} -Boolean describing behavior when a stat is undefined for batch member. +`Boolean` describing behavior when stats are undefined. - - - @@ -348,7 +349,7 @@ Standard deviation of the distribution. #### `tf.contrib.distributions.MultivariateNormalCholesky.validate_args` {#MultivariateNormalCholesky.validate_args} -Boolean describing behavior on invalid input. +`Boolean` describing behavior on invalid input. 
- - - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.detach_inputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.detach_inputs.md new file mode 100644 index 00000000000..fdf95a1b8f1 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.detach_inputs.md @@ -0,0 +1,22 @@ +### `tf.contrib.graph_editor.detach_inputs(sgv, control_inputs=False)` {#detach_inputs} + +Detach the inputs of a subgraph view. + +##### Args: + + +* `sgv`: the subgraph view to be detached. This argument is converted to a + subgraph using the same rules as the function subgraph.make_view. +* `control_inputs`: if True control_inputs are also detached. + +##### Returns: + + A new subgraph view of the detached subgraph. + Note that sgv is also modified in place. + +##### Raises: + + +* `StandardError`: if sgv cannot be converted to a SubGraphView using + the same rules than the function subgraph.make_view. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.reroute_a2b_outputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.reroute_a2b_outputs.md new file mode 100644 index 00000000000..0bf41935968 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.reroute_a2b_outputs.md @@ -0,0 +1,4 @@ +### `tf.contrib.graph_editor.reroute_a2b_outputs(sgv0, sgv1)` {#reroute_a2b_outputs} + +Re-route all the outputs of sgv0 to sgv1 (see _reroute_outputs). 
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.select_ops.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.select_ops.md new file mode 100644 index 00000000000..44660ef243a --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.select_ops.md @@ -0,0 +1,30 @@ +### `tf.contrib.graph_editor.select_ops(*args, **kwargs)` {#select_ops} + +Helper to select operations. + +##### Args: + + +* `*args`: list of 1) regular expressions (compiled or not) or 2) (array of) + tf.Operation. tf.Tensor instances are silently ignored. +* `**kwargs`: 'graph': tf.Graph in which to perform the regex query.This is + required when using regex. + 'positive_filter': an elem if selected only if positive_filter(elem) is + True. This is optional. + 'restrict_ops_regex': a regular expression is ignored if it doesn't start + with the substring "(?#ops)". + +##### Returns: + + list of tf.Operation + +##### Raises: + + +* `TypeError`: if the optional keyword argument graph is not a tf.Graph + or if an argument in args is not an (array of) tf.Operation + or an (array of) tf.Tensor (silently ignored) or a string + or a regular expression. +* `ValueError`: if one of the keyword arguments is unexpected or if a regular + expression is used without passing a graph as a keyword argument. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.swap_inputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.swap_inputs.md new file mode 100644 index 00000000000..bd18c89d6b2 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.graph_editor.swap_inputs.md @@ -0,0 +1,4 @@ +### `tf.contrib.graph_editor.swap_inputs(sgv0, sgv1)` {#swap_inputs} + +Swap all the inputs of sgv0 and sgv1 (see reroute_inputs). 
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md index 6492f54565b..14aad5b0ccb 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.learn.LinearRegressor.md @@ -54,9 +54,9 @@ Construct a `LinearRegressor` estimator object. * `feature_columns`: An iterable containing all the feature columns used by the model. All items in the set should be instances of classes derived from `FeatureColumn`. -* `model_dir`: Directory to save model parameters, graph, etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. +* `model_dir`: Directory to save model parameters, graph, etc. This can + also be used to load checkpoints from the directory into a estimator + to continue training a previously saved model. * `weight_column_name`: A string defining feature column name representing weights. It is used to down weight or boost examples during training. It will be multiplied by the loss of the example. @@ -101,56 +101,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.LinearRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LinearRegressor.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. 
- -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. 
##### Raises: @@ -164,37 +115,7 @@ for which this evaluation was performed. #### `tf.contrib.learn.LinearRegressor.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#LinearRegressor.fit} -Trains a model given training data `x` predictions and `y` targets. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `steps`: Number of steps for which to train model. If `None`, train forever. - If set, `max_steps` must be `None`. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. -* `max_steps`: Number of total steps for which to train model. If `None`, - train forever. If set, `steps` must be `None`. - - Two calls to `fit(steps=100)` means 200 training - iterations. On the other hand, two calls to `fit(max_steps=100)` means - that the second call will not do any iteration since first call did - all 100 steps. - -##### Returns: - - `self`, for chaining. +See `Trainable`. 
##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.rnn.AttentionCellWrapper.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.rnn.AttentionCellWrapper.md new file mode 100644 index 00000000000..73f35490f75 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.contrib.rnn.AttentionCellWrapper.md @@ -0,0 +1,70 @@ +Basic attention cell wrapper. + +Implementation based on https://arxiv.org/pdf/1601.06733.pdf. +- - - + +#### `tf.contrib.rnn.AttentionCellWrapper.__init__(cell, attn_length, attn_size=None, attn_vec_size=None, input_size=None, state_is_tuple=False)` {#AttentionCellWrapper.__init__} + +Create a cell with attention. + +##### Args: + + +* `cell`: an RNNCell, an attention is added to it. +* `attn_length`: integer, the size of an attention window. +* `attn_size`: integer, the size of an attention vector. Equal to + cell.output_size by default. +* `attn_vec_size`: integer, the number of convolutional features calculated + on attention state and a size of the hidden layer built from + base cell state. Equal to attn_size by default. +* `input_size`: integer, the size of a hidden linear layer, + built from inputs and attention. Derived from the input tensor + by default. +* `state_is_tuple`: If True, accepted and returned states are n-tuples, where + `n = len(cells)`. By default (False), the states are all + concatenated along the column axis. + +##### Raises: + + +* `TypeError`: if cell is not an RNNCell. +* `ValueError`: if cell returns a state tuple but the flag + `state_is_tuple` is `False` or if attn_length is zero or less.
+ + +- - - + +#### `tf.contrib.rnn.AttentionCellWrapper.output_size` {#AttentionCellWrapper.output_size} + + + + +- - - + +#### `tf.contrib.rnn.AttentionCellWrapper.state_size` {#AttentionCellWrapper.state_size} + + + + +- - - + +#### `tf.contrib.rnn.AttentionCellWrapper.zero_state(batch_size, dtype)` {#AttentionCellWrapper.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cumprod.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cumprod.md index a226ce07373..7381350be38 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cumprod.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.cumprod.md @@ -2,13 +2,15 @@ Compute the cumulative product of the tensor `x` along `axis`. -By default, this op performs an inclusive cumprod, which means that the first +By default, this op performs an inclusive cumprod, which means that the +first element of the input is identical to the first element of the output: ```prettyprint tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c] ``` -By setting the `exclusive` kwarg to `True`, an exclusive cumprod is performed +By setting the `exclusive` kwarg to `True`, an exclusive cumprod is +performed instead: ```prettyprint tf.cumprod([a, b, c], exclusive=True) ==> [0, a, a * b] @@ -30,8 +32,8 @@ tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 0] * `x`: A `Tensor`. 
Must be one of the following types: `float32`, `float64`, - `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, - `complex128`, `qint8`, `quint8`, `qint32`, `half`. + `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, + `complex128`, `qint8`, `quint8`, `qint32`, `half`. * `axis`: A `Tensor` of type `int32` (default: 0). * `reverse`: A `bool` (default: False). * `name`: A name for the operation (optional). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mod.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mod.md index 5bfe1058a77..86978890b5a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mod.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mod.md @@ -2,6 +2,9 @@ Returns element-wise remainder of division. +*NOTE*: `Mod` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mul.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mul.md index 3d6fa568645..2efd16e8915 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mul.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.mul.md @@ -2,6 +2,9 @@ Returns x * y element-wise. +*NOTE*: `Mul` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.rnn.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.rnn.md index 19caecfb70a..d9e935f8fb3 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.rnn.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.nn.rnn.md @@ -2,15 +2,15 @@ Creates a recurrent neural network specified by RNNCell `cell`. 
-##### The simplest form of RNN network generated is: - +The simplest form of RNN network generated is: +```py state = cell.zero_state(...) outputs = [] for input_ in inputs: output, state = cell(input_, state) outputs.append(output) return (outputs, state) - +``` However, a few other options are available: An initial state can be provided. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.not_equal.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.not_equal.md index 9c187922232..5ed8df49d5c 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.not_equal.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard0/tf.not_equal.md @@ -2,6 +2,9 @@ Returns the truth value of (x != y) element-wise. +*NOTE*: `NotEqual` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.Tensor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.Tensor.md index 73af134a7a5..6925d9d6d7c 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.Tensor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.Tensor.md @@ -1,4 +1,7 @@ -Represents a value produced by an `Operation`. +Represents one of the outputs of an `Operation`. + +*Note:* the `Tensor` class will be replaced by `Output` in the future. +Currently these two are aliases for each other. A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. 
It does not hold the values of that operation's output, diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.assert_less_equal.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.assert_less_equal.md index d740746a615..a37950c4929 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.assert_less_equal.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.assert_less_equal.md @@ -1,4 +1,4 @@ -### `tf.assert_less_equal(x, y, data=None, summarize=None, name=None)` {#assert_less_equal} +### `tf.assert_less_equal(x, y, data=None, summarize=None, message=None, name=None)` {#assert_less_equal} Assert the condition `x <= y` holds element-wise. @@ -27,6 +27,7 @@ If both `x` and `y` are empty, this is trivially satisfied. * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`, `y`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_less_equal" ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.assert_rank_at_least.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.assert_rank_at_least.md index 1b33f3401b3..8936468be23 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.assert_rank_at_least.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.assert_rank_at_least.md @@ -1,4 +1,4 @@ -### `tf.assert_rank_at_least(x, rank, data=None, summarize=None, name=None)` {#assert_rank_at_least} +### `tf.assert_rank_at_least(x, rank, data=None, summarize=None, message=None, name=None)` {#assert_rank_at_least} Assert `x` has rank equal to `rank` or higher. @@ -23,12 +23,14 @@ x = tf.with_dependencies([tf.assert_rank_at_least(x, 2)], x) * `data`: The tensors to print out if the condition is False. 
Defaults to error message and first few entries of `x`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_rank_at_least". ##### Returns: Op raising `InvalidArgumentError` unless `x` has specified rank or higher. + If static checks determine `x` has correct rank, a `no_op` is returned. ##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_matrix_inverse.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_matrix_inverse.md index 231056a05c2..6b51df6aec7 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_matrix_inverse.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_matrix_inverse.md @@ -1,6 +1,6 @@ ### `tf.batch_matrix_inverse(input, adjoint=None, name=None)` {#batch_matrix_inverse} -Calculates the inverse of square invertible matrices or their adjoints +Computes the inverse of square invertible matrices or their adjoints (conjugate transposes). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_self_adjoint_eigvals.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_self_adjoint_eigvals.md new file mode 100644 index 00000000000..77cdaf3ec3c --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.batch_self_adjoint_eigvals.md @@ -0,0 +1,16 @@ +### `tf.batch_self_adjoint_eigvals(tensor, name=None)` {#batch_self_adjoint_eigvals} + +Computes the eigenvalues of a batch of self-adjoint matrices. + +##### Args: + + +* `tensor`: `Tensor` of shape `[..., N, N]`. +* `name`: string, optional name of the operation. + +##### Returns: + + +* `e`: Eigenvalues. Shape is `[..., N]`. The vector `e[..., :]` contains the `N` + eigenvalues of `tensor[..., :, :]`. 
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.MultivariateNormalDiag.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.MultivariateNormalDiag.md index a599c0918d5..8d26e98d154 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.MultivariateNormalDiag.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.MultivariateNormalDiag.md @@ -13,7 +13,7 @@ The PDF of this distribution is defined in terms of the diagonal covariance determined by `diag_stdev`: `C_{ii} = diag_stdev[i]**2`. ``` -f(x) = (2*pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 * (x - mu)^T C^{-1} (x - mu)) +f(x) = (2 pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 (x - mu)^T C^{-1} (x - mu)) ``` #### Examples @@ -56,17 +56,17 @@ The mean of `X_i` is `mu[i]`, and the standard deviation is `diag_stdev[i]`. ##### Args: -* `mu`: Rank `N + 1` `float` or `double` tensor with shape `[N1,...,Nb, k]`, +* `mu`: Rank `N + 1` floating point tensor with shape `[N1,...,Nb, k]`, `b >= 0`. * `diag_stdev`: Rank `N + 1` `Tensor` with same `dtype` and shape as `mu`, - representing the standard deviations. + representing the standard deviations. Must be positive. * `validate_args`: Whether to validate input with asserts. If `validate_args` is `False`, and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: `Boolean`, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. 
* `name`: The name to give Ops created by the initializer. ##### Raises: @@ -79,7 +79,7 @@ The mean of `X_i` is `mu[i]`, and the standard deviation is `diag_stdev[i]`. #### `tf.contrib.distributions.MultivariateNormalDiag.allow_nan_stats` {#MultivariateNormalDiag.allow_nan_stats} -Boolean describing behavior when a stat is undefined for batch member. +`Boolean` describing behavior when stats are undefined. - - - @@ -348,7 +348,7 @@ Standard deviation of the distribution. #### `tf.contrib.distributions.MultivariateNormalDiag.validate_args` {#MultivariateNormalDiag.validate_args} -Boolean describing behavior on invalid input. +`Boolean` describing behavior on invalid input. - - - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.StudentT.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.StudentT.md index 0b00a17938d..c43058d8870 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.StudentT.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.distributions.StudentT.md @@ -57,19 +57,19 @@ broadcasting (e.g. `df + mu + sigma` is a valid operation). ##### Args: -* `df`: `float` or `double` tensor, the degrees of freedom of the +* `df`: Floating point tensor, the degrees of freedom of the distribution(s). `df` must contain only positive values. -* `mu`: `float` or `double` tensor, the means of the distribution(s). -* `sigma`: `float` or `double` tensor, the scaling factor for the +* `mu`: Floating point tensor, the means of the distribution(s). +* `sigma`: Floating point tensor, the scaling factor for the distribution(s). `sigma` must contain only positive values. Note that `sigma` is not the standard deviation of this distribution. * `validate_args`: Whether to assert that `df > 0, sigma > 0`. If - `validate_args` is False and inputs are invalid, correct behavior is not - guaranteed. 
-* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + `validate_args` is `False` and inputs are invalid, correct behavior is + not guaranteed. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to give Ops created by the initializer. ##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.graph_editor.ops.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.graph_editor.ops.md new file mode 100644 index 00000000000..d579ac9a46e --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.graph_editor.ops.md @@ -0,0 +1,30 @@ +### `tf.contrib.graph_editor.ops(*args, **kwargs)` {#ops} + +Helper to select operations. + +##### Args: + + +* `*args`: list of 1) regular expressions (compiled or not) or 2) (array of) + tf.Operation. tf.Tensor instances are silently ignored. +* `**kwargs`: 'graph': tf.Graph in which to perform the regex query. This is + required when using regex. + 'positive_filter': an elem is selected only if positive_filter(elem) is + True. This is optional. + 'restrict_ops_regex': a regular expression is ignored if it doesn't start + with the substring "(?#ops)". + +##### Returns: + + list of tf.Operation + +##### Raises: + + +* `TypeError`: if the optional keyword argument graph is not a tf.Graph + or if an argument in args is not an (array of) tf.Operation + or an (array of) tf.Tensor (silently ignored) or a string + or a regular expression. 
+* `ValueError`: if one of the keyword arguments is unexpected or if a regular + expression is used without passing a graph as a keyword argument. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.graph_editor.reroute_a2b_inputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.graph_editor.reroute_a2b_inputs.md new file mode 100644 index 00000000000..0f82675ef90 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.graph_editor.reroute_a2b_inputs.md @@ -0,0 +1,4 @@ +### `tf.contrib.graph_editor.reroute_a2b_inputs(sgv0, sgv1)` {#reroute_a2b_inputs} + +Re-route all the inputs of sgv0 to sgv1 (see reroute_inputs). + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md index eb2c56ad076..c3cbf1d862c 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.learn.LinearClassifier.md @@ -71,9 +71,9 @@ Construct a `LinearClassifier` estimator object. * `feature_columns`: An iterable containing all the feature columns used by the model. All items in the set should be instances of classes derived from `FeatureColumn`. -* `model_dir`: Directory to save model parameters, graph and etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. +* `model_dir`: Directory to save model parameters, graph and etc. This can + also be used to load checkpoints from the directory into a estimator + to continue training a previously saved model. * `n_classes`: number of target classes. Default is binary classification. * `weight_column_name`: A string defining feature column name representing weights. 
It is used to down weight or boost examples during training. It @@ -119,56 +119,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.LinearClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#LinearClassifier.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. 
If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: @@ -182,37 +133,7 @@ for which this evaluation was performed. #### `tf.contrib.learn.LinearClassifier.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#LinearClassifier.fit} -Trains a model given training data `x` predictions and `y` targets. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `steps`: Number of steps for which to train model. If `None`, train forever. - If set, `max_steps` must be `None`. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. -* `max_steps`: Number of total steps for which to train model. 
If `None`, - train forever. If set, `steps` must be `None`. - - Two calls to `fit(steps=100)` means 200 training - iterations. On the other hand, two calls to `fit(max_steps=100)` means - that the second call will not do any iteration since first call did - all 100 steps. - -##### Returns: - - `self`, for chaining. +See `Trainable`. ##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.hinge_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.hinge_loss.md new file mode 100644 index 00000000000..57758e07104 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.losses.hinge_loss.md @@ -0,0 +1,22 @@ +### `tf.contrib.losses.hinge_loss(logits, target, scope=None)` {#hinge_loss} + +Method that returns the loss tensor for hinge loss. + +##### Args: + + +* `logits`: The logits, a float tensor. +* `target`: The ground truth output tensor. Its shape should match the shape of + logits. The values of the tensor are expected to be 0.0 or 1.0. +* `scope`: The scope for the operations performed in computing the loss. + +##### Returns: + + A `Tensor` of same shape as logits and target representing the loss values + across the batch. + +##### Raises: + + +* `ValueError`: If the shapes of `logits` and `target` don't match. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.GridLSTMCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.GridLSTMCell.md new file mode 100644 index 00000000000..509f59748cd --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.contrib.rnn.GridLSTMCell.md @@ -0,0 +1,77 @@ +Grid Long short-term memory unit (LSTM) recurrent network cell. + +The default is based on: + Nal Kalchbrenner, Ivo Danihelka and Alex Graves + "Grid Long Short-Term Memory," Proc. ICLR 2016. 
+ http://arxiv.org/abs/1507.01526 + +When peephole connections are used, the implementation is based on: + Tara N. Sainath and Bo Li + "Modeling Time-Frequency Patterns with LSTM vs. Convolutional Architectures + for LVCSR Tasks." submitted to INTERSPEECH, 2016. + +The code uses optional peephole connections, shared_weights and cell clipping. +- - - + +#### `tf.contrib.rnn.GridLSTMCell.__init__(num_units, use_peepholes=False, share_time_frequency_weights=False, cell_clip=None, initializer=None, num_unit_shards=1, forget_bias=1.0, feature_size=None, frequency_skip=None)` {#GridLSTMCell.__init__} + +Initialize the parameters for an LSTM cell. + +##### Args: + + +* `num_units`: int, The number of units in the LSTM cell +* `use_peepholes`: bool, default False. Set True to enable diagonal/peephole + connections. +* `share_time_frequency_weights`: bool, default False. Set True to enable + shared cell weights between time and frequency LSTMs. +* `cell_clip`: (optional) A float value, if provided the cell state is clipped + by this value prior to the cell output activation. +* `initializer`: (optional) The initializer to use for the weight and + projection matrices. +* `num_unit_shards`: int, How to split the weight matrix. If >1, the weight + matrix is stored across num_unit_shards. +* `forget_bias`: float, Biases of the forget gate are initialized by default + to 1 in order to reduce the scale of forgetting at the beginning + of the training. +* `feature_size`: int, The size of the input feature the LSTM spans over. +* `frequency_skip`: int, The amount the LSTM filter is shifted by in + frequency. + + +- - - + +#### `tf.contrib.rnn.GridLSTMCell.output_size` {#GridLSTMCell.output_size} + + + + +- - - + +#### `tf.contrib.rnn.GridLSTMCell.state_size` {#GridLSTMCell.state_size} + + + + +- - - + +#### `tf.contrib.rnn.GridLSTMCell.zero_state(batch_size, dtype)` {#GridLSTMCell.zero_state} + +Return zero-filled state tensor(s). 
+ +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.greater_equal.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.greater_equal.md index 9d68429c36c..d6ce057c133 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.greater_equal.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.greater_equal.md @@ -2,6 +2,9 @@ Returns the truth value of (x >= y) element-wise. +*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md index 40a4332531b..dd98fd9dd8a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard1/tf.map_fn.md @@ -1,4 +1,4 @@ -### `tf.map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, swap_memory=False, name=None)` {#map_fn} +### `tf.map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name=None)` {#map_fn} map on the list of tensors unpacked from `elems` on dimension 0. @@ -40,6 +40,7 @@ nested) tuple of types matching the output of `fn`. in parallel. * `back_prop`: (optional) True enables support for back propagation. 
* `swap_memory`: (optional) True enables GPU-CPU memory swapping. +* `infer_shape`: (optional) False disables tests for consistent output shapes. * `name`: (optional) Name prefix for the returned tensors. ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.InteractiveSession.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.InteractiveSession.md index cdb5101815d..308a0a80b49 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.InteractiveSession.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.InteractiveSession.md @@ -53,8 +53,7 @@ the session constructor. * `target`: (Optional.) The execution engine to connect to. - Defaults to using an in-process engine. At present, no value - other than the empty string is supported. + Defaults to using an in-process engine. * `graph`: (Optional.) The `Graph` to be launched (described above). * `config`: (Optional) `ConfigProto` proto used to configure the session. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.assert_non_negative.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.assert_non_negative.md index 47f07a698a8..4c0035b75cf 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.assert_non_negative.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.assert_non_negative.md @@ -1,4 +1,4 @@ -### `tf.assert_non_negative(x, data=None, summarize=None, name=None)` {#assert_non_negative} +### `tf.assert_non_negative(x, data=None, summarize=None, message=None, name=None)` {#assert_non_negative} Assert the condition `x >= 0` holds element-wise. @@ -25,6 +25,7 @@ If `x` is empty this is trivially satisfied. * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. * `summarize`: Print this many entries of each tensor. 
+* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_non_negative". diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.batch_matrix_determinant.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.batch_matrix_determinant.md index d55bf96f187..a30b74e35cc 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.batch_matrix_determinant.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.batch_matrix_determinant.md @@ -1,6 +1,6 @@ ### `tf.batch_matrix_determinant(input, name=None)` {#batch_matrix_determinant} -Calculates the determinants for a batch of square matrices. +Computes the determinants for a batch of square matrices. The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form square matrices. The output is a tensor containing the determinants diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Categorical.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Categorical.md index 052af1eb55e..a207a1112ec 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Categorical.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Categorical.md @@ -2,11 +2,6 @@ Categorical distribution. The categorical distribution is parameterized by the log-probabilities of a set of classes. - -Note, the following methods of the base class aren't implemented: - * mean - * cdf - * log_cdf - - - #### `tf.contrib.distributions.Categorical.__init__(logits, dtype=tf.int32, validate_args=True, allow_nan_stats=False, name='Categorical')` {#Categorical.__init__} @@ -22,10 +17,10 @@ Initialize Categorical distributions using class log-probabilities. indexes into the classes. * `dtype`: The type of the event samples (default: int32). 
* `validate_args`: Unused in this distribution. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: A name for this distribution (optional). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Chi2.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Chi2.md index 2f692a15f9c..f01b075d05a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Chi2.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Chi2.md @@ -15,15 +15,15 @@ Construct Chi2 distributions with parameter `df`. ##### Args: -* `df`: `float` or `double` tensor, the degrees of freedom of the +* `df`: Floating point tensor, the degrees of freedom of the distribution(s). `df` must contain only positive values. * `validate_args`: Whether to assert that `df > 0`, and that `x > 0` in the - methods `prob(x)` and `log_prob(x)`. If `validate_args` is False + methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False` and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. 
mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prepend to all ops created by this distribution. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Uniform.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Uniform.md index 9862309eed4..9eea17257d9 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Uniform.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.distributions.Uniform.md @@ -31,14 +31,14 @@ u1 = Uniform(3.0, [5.0, 6.0, 7.0]) # 3 distributions ##### Args: -* `a`: `float` or `double` tensor, the minimum endpoint. -* `b`: `float` or `double` tensor, the maximum endpoint. Must be > `a`. -* `validate_args`: Whether to assert that `a > b`. If `validate_args` is False - and inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `a`: Floating point tensor, the minimum endpoint. +* `b`: Floating point tensor, the maximum endpoint. Must be > `a`. +* `validate_args`: Whether to assert that `a > b`. If `validate_args` is + `False` and inputs are invalid, correct behavior is not guaranteed. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prefix Ops created by this distribution class. 
##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.framework.deprecated_arg_values.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.framework.deprecated_arg_values.md new file mode 100644 index 00000000000..285ea14f96e --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.framework.deprecated_arg_values.md @@ -0,0 +1,35 @@ +### `tf.contrib.framework.deprecated_arg_values(date, instructions, **deprecated_kwargs)` {#deprecated_arg_values} + +Decorator for marking specific function argument values as deprecated. + +This decorator logs a deprecation warning whenever the decorated function is +called with the deprecated argument values. It has the following format: + + Calling (from ) with = is deprecated and + will be removed after . Instructions for updating: + + + will include the class name if it is a method. + +It also edits the docstring of the function: ' (deprecated arguments)' is +appended to the first line of the docstring and a deprecation notice is +prepended to the rest of the docstring. + +##### Args: + + +* `date`: String. The date the function is scheduled to be removed. Must be + ISO 8601 (YYYY-MM-DD). +* `instructions`: String. Instructions on how to update code using the + deprecated function. +* `**deprecated_kwargs`: The deprecated argument values. + +##### Returns: + + Decorated function or method. + +##### Raises: + + +* `ValueError`: If date is not in ISO 8601 format, or instructions are empty. 
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.graph_editor.SubGraphView.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.graph_editor.SubGraphView.md new file mode 100644 index 00000000000..bf2ecc56456 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.graph_editor.SubGraphView.md @@ -0,0 +1,391 @@ +A subgraph view on an existing tf.Graph. + +An instance of this class is a subgraph view on an existing tf.Graph. +"subgraph" means that it can represent part of the whole tf.Graph. +"view" means that it only provides a passive observation and do not to act +on the tf.Graph. Note that in this documentation, the term "subgraph" is often +used as substitute to "subgraph view". + +A subgraph contains: +- a list of input tensors, accessible via the "inputs" property. +- a list of output tensors, accessible via the "outputs" property. +- and the operations in between, accessible via the "ops" property. + +An subgraph can be seen as a function F(i0, i1, ...) -> o0, o1, ... It is a +function which takes as input some input tensors and returns as output some +output tensors. The computation that the function performs is encoded in the +operations of the subgraph. + +The tensors (input or output) can be of two kinds: +- connected: a connected tensor connects to at least one operation contained +in the subgraph. One example is a subgraph representing a single operation +and its inputs and outputs: all the input and output tensors of the op +are "connected". +- passthrough: a passthrough tensor does not connect to any operation +contained in the subgraph. One example is a subgraph representing a +single tensor: this tensor is passthrough. By default a passthrough tensor is +present both in the input and output tensors of the subgraph. It can however +be remapped to only appear as an input (or output) only. + +The input and output tensors can be remapped. 
For instance, some input tensor +can be omitted. For instance, a subgraph representing an operation with two +inputs can be remapped to only take one input. Note that this does not change +at all the underlying tf.Graph (remember, it is a view). It means that +the other input is being ignored, or is being treated as "given". +The analogy with functions can be extended like this: F(x,y) is the original +function. Remapping the inputs from [x, y] to just [x] means that the subgraph +now represents the function F_y(x) (y is "given"). + +The output tensors can also be remapped. For instance, some output tensor can +be omitted. Other output tensors can be duplicated as well. As mentioned +before, this does not change at all the underlying tf.Graph. +The analogy with functions can be extended like this: F(...)->x,y is the +original function. Remapping the outputs from [x, y] to just [y,y] means that +the subgraph now represents the function M(F(...)) where M is the function +M(a,b)->b,b. + +It is useful to describe three other kinds of tensors: +- internal: an internal tensor is a tensor connecting operations contained +in the subgraph. One example is the subgraph representing the two operations +A and B connected sequentially: -> A -> B ->. The middle arrow is an internal +tensor. +- actual input: an input tensor of the subgraph, regardless of whether it is + listed in "inputs" or not (masked-out). +- actual output: an output tensor of the subgraph, regardless of whether it is + listed in "outputs" or not (masked-out). +- hidden input: an actual input which has been masked-out using an + input remapping. In other words, a hidden input is a non-internal tensor + not listed as an input tensor and one of whose consumers belongs to + the subgraph. +- hidden output: an actual output which has been masked-out using an output + remapping. In other words, a hidden output is a non-internal tensor + not listed as an output and one of whose generating operations belongs to + the subgraph.
+ +Here are some useful guarantees about an instance of a SubGraphView: +- the input (or output) tensors are not internal. +- the input (or output) tensors are either "connected" or "passthrough". +- the passthrough tensors are not connected to any of the operations of +the subgraph. + +Note that there is no guarantee that an operation in a subgraph contributes +at all to its inputs or outputs. For instance, remapping both the inputs and +outputs to empty lists will produce a subgraph which still contains all the +original operations. However, the remove_unused_ops function can be used to +make a new subgraph view whose operations are connected to at least one of +the input or output tensors. + +An instance of this class is meant to be a lightweight object which is not +modified in-place by the user. Rather, the user can create new modified +instances of a given subgraph. In that sense, the class SubGraphView is meant +to be used like an immutable python object. + +A common problem when using views is that they can get out-of-sync with the +data they observe (in this case, a tf.Graph). It is up to the user to ensure +that this doesn't happen. To keep on the safe side, it is recommended that +the lifetime of subgraph views is kept very short. One way to achieve this +is to use subgraphs within a "with make_sgv(...) as sgv:" Python context. + +To alleviate the out-of-sync problem, some functions are granted the right to +modify subgraphs in place. This is typically the case of graph manipulation +functions which, given some subgraphs as arguments, can modify the underlying +tf.Graph. Since this modification is likely to render the subgraph view +invalid, those functions can modify the argument in place to reflect the +change. For instance, calling the function swap_inputs(svg0, svg1) will modify +svg0 and svg1 in place to reflect the fact that their inputs have now been +swapped.
+- - - + +#### `tf.contrib.graph_editor.SubGraphView.__init__(inside_ops=(), passthrough_ts=())` {#SubGraphView.__init__} + +Create a subgraph containing the given ops and the "passthrough" tensors. + +##### Args: + + +* `inside_ops`: an object convertible to a list of tf.Operation. This list + defines all the operations in the subgraph. +* `passthrough_ts`: an object convertible to a list of tf.Tensor. This list + defines all the "passthrough" tensors. A passthrough tensor is a tensor + which goes directly from the input of the subgraph to its output, without + any intermediate operations. All the non passthrough tensors are + silently ignored. + +##### Raises: + + +* `TypeError`: if inside_ops cannot be converted to a list of tf.Operation or + if passthrough_ts cannot be converted to a list of tf.Tensor. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.connected_inputs` {#SubGraphView.connected_inputs} + +The connected input tensors of this subgraph view. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.connected_outputs` {#SubGraphView.connected_outputs} + +The connected output tensors of this subgraph view. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.consumers()` {#SubGraphView.consumers} + +Return a Python set of all the consumers of this subgraph view. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.copy()` {#SubGraphView.copy} + +Return a copy of itself. + +Note that this class is a "view", copying it only creates another view and +does not copy the underlying part of the tf.Graph. + +##### Returns: + + a new instance identical to the original one. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.find_op_by_name(op_name)` {#SubGraphView.find_op_by_name} + +Return the op named op_name. + +##### Args: + + +* `op_name`: the name to search for + +##### Returns: + + The op named op_name. + +##### Raises: + + +* `ValueError`: if the op_name could not be found. +* `AssertionError`: if the name was found multiple times.
+ + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.graph` {#SubGraphView.graph} + +The underlying tf.Graph. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.input_index(t)` {#SubGraphView.input_index} + +Find the input index corresponding to the given input tensor t. + +##### Args: + + +* `t`: the input tensor of this subgraph view. + +##### Returns: + + the index in the self.inputs list. + +##### Raises: + + +* `Error`: if t is not an input tensor. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.inputs` {#SubGraphView.inputs} + +The input tensors of this subgraph view. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.is_passthrough(t)` {#SubGraphView.is_passthrough} + +Check whether a tensor is passthrough. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.op(op_id)` {#SubGraphView.op} + +Get an op by its index. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.ops` {#SubGraphView.ops} + +The operations in this subgraph view. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.output_index(t)` {#SubGraphView.output_index} + +Find the output index corresponding to the given output tensor t. + +##### Args: + + +* `t`: the output tensor of this subgraph view. + +##### Returns: + + the index in the self.outputs list. + +##### Raises: + + +* `Error`: if t is not an output tensor. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.outputs` {#SubGraphView.outputs} + +The output tensors of this subgraph view. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.passthroughs` {#SubGraphView.passthroughs} + +The passthrough tensors, going straight from input to output. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remap(new_input_indices=None, new_output_indices=None)` {#SubGraphView.remap} + +Remap the inputs and outputs of the subgraph. + +Note that this is only modifying the view: the underlying tf.Graph is not +affected.
+ +##### Args: + + +* `new_input_indices`: an iterable of integers representing a mapping between + the old inputs and the new ones. This mapping can be under-complete and + must be without repetitions. +* `new_output_indices`: an iterable of integers representing a mapping between + the old outputs and the new ones. This mapping can be under-complete and + can have repetitions. + +##### Returns: + + A new modified instance of the original subgraph view with remapped + inputs and outputs. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remap_default(remove_input_map=True, remove_output_map=True)` {#SubGraphView.remap_default} + +Remap the inputs and/or outputs to the default mapping. + +##### Args: + + +* `remove_input_map`: if True the input map is reset to the default one. +* `remove_output_map`: if True the output map is reset to the default one. + +##### Returns: + + A new modified instance of the original subgraph view with its + input and/or output mapping reset to the default one. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remap_inputs(new_input_indices)` {#SubGraphView.remap_inputs} + +Remap the inputs of the subgraph. + +If the inputs of the original subgraph are [t0, t1, t2], remapping to [2,0] +will create a new instance whose inputs is [t2, t0]. + +Note that this is only modifying the view: the underlying tf.Graph is not +affected. + +##### Args: + + +* `new_input_indices`: an iterable of integers representing a mapping between + the old inputs and the new ones. This mapping can be under-complete and + must be without repetitions. + +##### Returns: + + A new modified instance of the original subgraph view with remapped + inputs. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remap_outputs(new_output_indices)` {#SubGraphView.remap_outputs} + +Remap the output of the subgraph. + +If the output of the original subgraph are [t0, t1, t2], remapping to +[1,1,0] will create a new instance whose outputs is [t1, t1, t0]. 
+ +Note that this is only modifying the view: the underlying tf.Graph is not +affected. + +##### Args: + + +* `new_output_indices`: an iterable of integers representing a mapping between + the old outputs and the new ones. This mapping can be under-complete and + can have repetitions. + +##### Returns: + + A new modified instance of the original subgraph view with remapped + outputs. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remap_outputs_make_unique()` {#SubGraphView.remap_outputs_make_unique} + +Remap the outputs so that all the tensors appears only once. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remap_outputs_to_consumers()` {#SubGraphView.remap_outputs_to_consumers} + +Remap the outputs to match the number of consumers. + + +- - - + +#### `tf.contrib.graph_editor.SubGraphView.remove_unused_ops(control_inputs=True)` {#SubGraphView.remove_unused_ops} + +Remove unused ops. + +##### Args: + + +* `control_inputs`: if True, control inputs are used to detect used ops. + +##### Returns: + + A new subgraph view which only contains used operations. + + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.graph_editor.copy.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.graph_editor.copy.md new file mode 100644 index 00000000000..0d3ac62e34e --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.graph_editor.copy.md @@ -0,0 +1,24 @@ +### `tf.contrib.graph_editor.copy(sgv, dst_graph=None, dst_scope='', src_scope='')` {#copy} + +Copy a subgraph. + +##### Args: + + +* `sgv`: the source subgraph-view. This argument is converted to a subgraph + using the same rules than the function subgraph.make_view. +* `dst_graph`: the destination graph. +* `dst_scope`: the destination scope. +* `src_scope`: the source scope. + +##### Returns: + + the subgraph view of the copied subgraph. + +##### Raises: + + +* `TypeError`: if dst_graph is not a tf.Graph. 
+* `StandardError`: if sgv cannot be converted to a SubGraphView using + the same rules than the function subgraph.make_view. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.layers.optimize_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.layers.optimize_loss.md index 8e6072532be..de8aa8d384a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.layers.optimize_loss.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.layers.optimize_loss.md @@ -1,4 +1,4 @@ -### `tf.contrib.layers.optimize_loss(loss, global_step, learning_rate, optimizer, gradient_noise_scale=None, gradient_multipliers=None, clip_gradients=None, moving_average_decay=0.9, learning_rate_decay_fn=None, update_ops=None, variables=None, name=None)` {#optimize_loss} +### `tf.contrib.layers.optimize_loss(loss, global_step, learning_rate, optimizer, gradient_noise_scale=None, gradient_multipliers=None, clip_gradients=None, moving_average_decay=None, learning_rate_decay_fn=None, update_ops=None, variables=None, name=None, summaries=None)` {#optimize_loss} Given loss and parameters for optimizer, returns a training op. @@ -21,8 +21,8 @@ Given loss and parameters for optimizer, returns a training op. If present, gradients for specified variables will be multiplied by given constant. * `clip_gradients`: float or `None`, clips gradients by this value. -* `moving_average_decay`: float or None, takes into account previous loss - to make learning smoother due to outliers. +* `moving_average_decay`: Deprecated. float or None, takes into account previous + loss to make learning smoother due to outliers. * `learning_rate_decay_fn`: function, takes `learning_rate` and `global_step` `Tensor`s, returns `Tensor`. Can be used to implement any learning rate decay @@ -33,6 +33,9 @@ Given loss and parameters for optimizer, returns a training op. 
* `variables`: list of variables to optimize or `None` to use all trainable variables. * `name`: The name for this operation is used to scope operations and summaries. +* `summaries`: List of internal quantities to visualize on tensorboard. If not + set only the loss and the learning rate will be reported. The + complete list is in OPTIMIZER_SUMMARIES. ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.BaseEstimator.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.BaseEstimator.md index ba7fd7805d5..ca3154b76f5 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.BaseEstimator.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.BaseEstimator.md @@ -16,9 +16,9 @@ Initializes a BaseEstimator instance. ##### Args: -* `model_dir`: Directory to save model parameters, graph and etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. +* `model_dir`: Directory to save model parameters, graph and etc. This can + also be used to load checkpoints from the directory into a estimator to + continue training a previously saved model. * `config`: A RunConfig instance. @@ -26,56 +26,7 @@ Initializes a BaseEstimator instance. #### `tf.contrib.learn.BaseEstimator.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#BaseEstimator.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. 
- -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. 
##### Raises: @@ -89,37 +40,7 @@ for which this evaluation was performed. #### `tf.contrib.learn.BaseEstimator.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#BaseEstimator.fit} -Trains a model given training data `x` predictions and `y` targets. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `steps`: Number of steps for which to train model. If `None`, train forever. - If set, `max_steps` must be `None`. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. -* `max_steps`: Number of total steps for which to train model. If `None`, - train forever. If set, `steps` must be `None`. - - Two calls to `fit(steps=100)` means 200 training - iterations. On the other hand, two calls to `fit(max_steps=100)` means - that the second call will not do any iteration since first call did - all 100 steps. - -##### Returns: - - `self`, for chaining. +See `Trainable`. 
##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.TensorFlowDNNRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.TensorFlowDNNRegressor.md index ad2b7626ebe..f13720e198d 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.TensorFlowDNNRegressor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.contrib.learn.TensorFlowDNNRegressor.md @@ -31,56 +31,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.TensorFlowDNNRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowDNNRegressor.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. 
-* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.minimum.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.minimum.md index bff13483f4d..9bcd03f6e78 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.minimum.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.minimum.md @@ -1,6 +1,9 @@ ### `tf.minimum(x, y, name=None)` {#minimum} -Returns the min of x and y (i.e. x < y ? x : y) element-wise, broadcasts. +Returns the min of x and y (i.e. x < y ? x : y) element-wise. + +*NOTE*: `Minimum` supports broadcasting. 
More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.assert_rank.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.assert_rank.md index e8da009641e..a4a782b24e5 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.assert_rank.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.assert_rank.md @@ -1,4 +1,4 @@ -### `tf.assert_rank(x, rank, data=None, summarize=None, name=None)` {#assert_rank} +### `tf.assert_rank(x, rank, data=None, summarize=None, message=None, name=None)` {#assert_rank} Assert `x` has rank equal to `rank`. @@ -23,11 +23,13 @@ x = tf.with_dependencies([tf.assert_rank(x, 2)], x) * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_rank". ##### Returns: Op raising `InvalidArgumentError` unless `x` has specified rank. + If static checks determine `x` has correct rank, a `no_op` is returned. ##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.assert_type.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.assert_type.md index e98b9dc4afb..80f0ac6a099 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.assert_type.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.assert_type.md @@ -1,15 +1,21 @@ -### `tf.assert_type(tensor, tf_type)` {#assert_type} +### `tf.assert_type(tensor, tf_type, message=None, name=None)` {#assert_type} -Asserts that the given `Tensor` is of the specified type. +Statically asserts that the given `Tensor` is of the specified type. ##### Args: * `tensor`: A tensorflow `Tensor`. 
* `tf_type`: A tensorflow type (dtypes.float32, tf.int64, dtypes.bool, etc). +* `message`: A string to prefix to the default message. +* `name`: A name to give this `Op`. Defaults to "assert_type". ##### Raises: -* `ValueError`: If the tensors data type doesn't match tf_type. +* `TypeError`: If the tensor's data type doesn't match tf_type. + +##### Returns: + + A `no_op` that does nothing. Type can be determined statically. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.batch_self_adjoint_eig.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.batch_self_adjoint_eig.md index 19d6c5319f0..fe05ec127a2 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.batch_self_adjoint_eig.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.batch_self_adjoint_eig.md @@ -1,22 +1,22 @@ -### `tf.batch_self_adjoint_eig(input, name=None)` {#batch_self_adjoint_eig} +### `tf.batch_self_adjoint_eig(tensor, name=None)` {#batch_self_adjoint_eig} -Calculates the Eigen Decomposition of a batch of square self-adjoint matrices. +Computes the eigen decomposition of a batch of self-adjoint matrices. -The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions -form square matrices, with the same constraints as the single matrix -SelfAdjointEig. - -The result is a '[..., M+1, M] matrix with [..., 0,:] containing the -eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. +Computes the eigenvalues and eigenvectors of the innermost N-by-N matrices +in `tensor` such that +`tensor[...,:,:] * v[..., :,i] = e(..., i) * v[...,:,i]`, for i=0...N-1. ##### Args: -* `input`: A `Tensor`. Must be one of the following types: `float64`, `float32`. - Shape is `[..., M, M]`. -* `name`: A name for the operation (optional). +* `tensor`: `Tensor` of shape `[..., N, N]`. +* `name`: string, optional name of the operation. ##### Returns: - A `Tensor`. Has the same type as `input`. 
Shape is `[..., M+1, M]`. + +* `e`: Eigenvalues. Shape is `[..., N]`. +* `v`: Eigenvectors. Shape is `[..., N, N]`. The columns of the innermost + matrices + contain eigenvectors of the corresponding matrices in `tensor`. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Binomial.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Binomial.md new file mode 100644 index 00000000000..96d194944e1 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Binomial.md @@ -0,0 +1,401 @@ +Binomial distribution. + +This distribution is parameterized by a vector `p` of probabilities and `n`, +the total counts. + +#### Mathematical details + +The Binomial is a distribution over the number of successes in `n` independent +trials, with each trial having the same probability of success `p`. +The probability mass function (pmf): + +```pmf(k) = n! / (k! * (n - k)!) * (p)^k * (1 - p)^(n - k)``` + +#### Examples + +Create a single distribution, corresponding to 5 coin flips. + +```python +dist = Binomial(n=5., p=.5) +``` + +Create a single distribution (using logits), corresponding to 5 coin flips. + +```python +dist = Binomial(n=5., logits=0.) +``` + +Creates 3 distributions with the third distribution most likely to have +successes. + +```python +p = [.2, .3, .8] +# n will be broadcast to [4., 4., 4.], to match p. +dist = Binomial(n=4., p=p) +``` + +The distribution functions can be evaluated on counts. + +```python +# counts same shape as p. +counts = [1., 2, 3] +dist.prob(counts) # Shape [3] + +# p will be broadcast to [[.2, .3, .8], [.2, .3, .8]] to match counts. +counts = [[1., 2, 1], [2, 2, 4]] +dist.prob(counts) # Shape [2, 3] + +# p will be broadcast to shape [5, 7, 3] to match counts. 
+counts = [[...]] # Shape [5, 7, 3] +dist.prob(counts) # Shape [5, 7, 3] +``` +- - - + +#### `tf.contrib.distributions.Binomial.__init__(n, logits=None, p=None, validate_args=True, allow_nan_stats=False, name='Binomial')` {#Binomial.__init__} + +Initialize a batch of Binomial distributions. + +##### Args: + + +* `n`: Non-negative floating point tensor with shape broadcastable to + `[N1,..., Nm]` with `m >= 0` and the same dtype as `p` or `logits`. + Defines this as a batch of `N1 x ... x Nm` different Binomial + distributions. Its components should be equal to integer values. +* `logits`: Floating point tensor representing the log-odds of a + positive event with shape broadcastable to `[N1,..., Nm]` `m >= 0`, and + the same dtype as `n`. Each entry represents logits for the probability + of success for independent Binomial distributions. +* `p`: Positive floating point tensor with shape broadcastable to + `[N1,..., Nm]` `m >= 0`, `p in [0, 1]`. Each entry represents the + probability of success for independent Binomial distributions. +* `validate_args`: Whether to assert valid values for parameters `n` and `p`, + and `x` in `prob` and `log_prob`. If `False`, correct behavior is not + guaranteed. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. +* `name`: The name to prefix Ops created by this distribution class. + + +* `Examples`: + +```python +# Define 1-batch of a binomial distribution. +dist = Binomial(n=2., p=.9) + +# Define a 2-batch. +dist = Binomial(n=[4., 5], p=[.1, .3]) +``` + + +- - - + +#### `tf.contrib.distributions.Binomial.allow_nan_stats` {#Binomial.allow_nan_stats} + +Boolean describing behavior when a stat is undefined for batch member. 
+ + +- - - + +#### `tf.contrib.distributions.Binomial.batch_shape(name='batch_shape')` {#Binomial.batch_shape} + +Batch dimensions of this instance as a 1-D int32 `Tensor`. + +The product of the dimensions of the `batch_shape` is the number of +independent distributions of this kind the instance represents. + +##### Args: + + +* `name`: name to give to the op + +##### Returns: + + `Tensor` `batch_shape` + + +- - - + +#### `tf.contrib.distributions.Binomial.cdf(value, name='cdf')` {#Binomial.cdf} + +Cumulative distribution function. + + +- - - + +#### `tf.contrib.distributions.Binomial.dtype` {#Binomial.dtype} + +dtype of samples from this distribution. + + +- - - + +#### `tf.contrib.distributions.Binomial.entropy(name='entropy')` {#Binomial.entropy} + +Entropy of the distribution in nats. + + +- - - + +#### `tf.contrib.distributions.Binomial.event_shape(name='event_shape')` {#Binomial.event_shape} + +Shape of a sample from a single distribution as a 1-D int32 `Tensor`. + +##### Args: + + +* `name`: name to give to the op + +##### Returns: + + `Tensor` `event_shape` + + +- - - + +#### `tf.contrib.distributions.Binomial.get_batch_shape()` {#Binomial.get_batch_shape} + +`TensorShape` available at graph construction time. + +Same meaning as `batch_shape`. May be only partially defined. + +##### Returns: + + batch shape + + +- - - + +#### `tf.contrib.distributions.Binomial.get_event_shape()` {#Binomial.get_event_shape} + +`TensorShape` available at graph construction time. + +Same meaning as `event_shape`. May be only partially defined. + +##### Returns: + + event shape + + +- - - + +#### `tf.contrib.distributions.Binomial.is_continuous` {#Binomial.is_continuous} + + + + +- - - + +#### `tf.contrib.distributions.Binomial.is_reparameterized` {#Binomial.is_reparameterized} + + + + +- - - + +#### `tf.contrib.distributions.Binomial.log_cdf(value, name='log_cdf')` {#Binomial.log_cdf} + +Log CDF. 
+ + +- - - + +#### `tf.contrib.distributions.Binomial.log_pdf(value, name='log_pdf')` {#Binomial.log_pdf} + +Log of the probability density function. + + +- - - + +#### `tf.contrib.distributions.Binomial.log_pmf(value, name='log_pmf')` {#Binomial.log_pmf} + +Log of the probability mass function. + + +- - - + +#### `tf.contrib.distributions.Binomial.log_prob(counts, name='log_prob')` {#Binomial.log_prob} + +`Log(P[counts])`, computed for every batch member. + +For each batch member of counts `k`, `P[counts]` is the probability that +after sampling `n` draws from this Binomial distribution, the number of +successes is `k`. Note that different sequences of draws can result in the +same counts, thus the probability includes a combinatorial coefficient. + +##### Args: + + +* `counts`: Non-negative tensor with dtype `dtype` and whose shape can be + broadcast with `self.p` and `self.n`. `counts` is only legal if it is + less than or equal to `n` and its components are equal to integer + values. +* `name`: Name to give this Op, defaults to "log_prob". + +##### Returns: + + Log probabilities for each record, shape `[N1,...,Nm]`. + + +- - - + +#### `tf.contrib.distributions.Binomial.logits` {#Binomial.logits} + +Log-odds. + + +- - - + +#### `tf.contrib.distributions.Binomial.mean(name='mean')` {#Binomial.mean} + +Mean of the distribution. + + +- - - + +#### `tf.contrib.distributions.Binomial.mode(name='mode')` {#Binomial.mode} + +Mode of the distribution. + +Note that when `(n + 1) * p` is an integer, there are actually two modes. +Namely, `(n + 1) * p` and `(n + 1) * p - 1` are both modes. Here we return +only the larger of the two modes. + +##### Args: + + +* `name`: The name for this op. + +##### Returns: + + The mode of the Binomial distribution. + + +- - - + +#### `tf.contrib.distributions.Binomial.n` {#Binomial.n} + +Number of trials. + + +- - - + +#### `tf.contrib.distributions.Binomial.name` {#Binomial.name} + +Name to prepend to all ops. 
+ + +- - - + +#### `tf.contrib.distributions.Binomial.p` {#Binomial.p} + +Probability of success. + + +- - - + +#### `tf.contrib.distributions.Binomial.pdf(value, name='pdf')` {#Binomial.pdf} + +The probability density function. + + +- - - + +#### `tf.contrib.distributions.Binomial.pmf(value, name='pmf')` {#Binomial.pmf} + +The probability mass function. + + +- - - + +#### `tf.contrib.distributions.Binomial.prob(counts, name='prob')` {#Binomial.prob} + +`P[counts]`, computed for every batch member. + + +For each batch member of counts `k`, `P[counts]` is the probability that +after sampling `n` draws from this Binomial distribution, the number of +successes is `k`. Note that different sequences of draws can result in the +same counts, thus the probability includes a combinatorial coefficient. + +##### Args: + + +* `counts`: Non-negative tensor with dtype `dtype` and whose shape can be + broadcast with `self.p` and `self.n`. `counts` is only legal if it is + less than or equal to `n` and its components are equal to integer + values. +* `name`: Name to give this Op, defaults to "prob". + +##### Returns: + + Probabilities for each record, shape `[N1,...,Nm]`. + + +- - - + +#### `tf.contrib.distributions.Binomial.sample(sample_shape=(), seed=None, name='sample')` {#Binomial.sample} + +Generate samples of the specified shape for each batched distribution. + +Note that a call to `sample()` without arguments will generate a single +sample per batched distribution. + +##### Args: + + +* `sample_shape`: `int32` `Tensor` or tuple or list. Shape of the generated + samples. +* `seed`: Python integer seed for RNG +* `name`: name to give to the op. + +##### Returns: + + +* `samples`: a `Tensor` of dtype `self.dtype` and shape + `sample_shape + self.batch_shape + self.event_shape`. + + +- - - + +#### `tf.contrib.distributions.Binomial.sample_n(n, seed=None, name='sample_n')` {#Binomial.sample_n} + +Generate `n` samples. + +##### Args: + + +* `n`: scalar. 
Number of samples to draw from each distribution. +* `seed`: Python integer seed for RNG +* `name`: name to give to the op. + +##### Returns: + + +* `samples`: a `Tensor` of shape `(n,) + self.batch_shape + self.event_shape` + with values of type `self.dtype`. + + +- - - + +#### `tf.contrib.distributions.Binomial.std(name='std')` {#Binomial.std} + +Standard deviation of the distribution. + + +- - - + +#### `tf.contrib.distributions.Binomial.validate_args` {#Binomial.validate_args} + +Boolean describing behavior on invalid input. + + +- - - + +#### `tf.contrib.distributions.Binomial.variance(name='variance')` {#Binomial.variance} + +Variance of the distribution. + + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.DirichletMultinomial.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.DirichletMultinomial.md index f5a88c11dd9..004dc294dca 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.DirichletMultinomial.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.DirichletMultinomial.md @@ -67,32 +67,29 @@ dist.pmf(counts) # Shape [2] ``` - - - -#### `tf.contrib.distributions.DirichletMultinomial.__init__(n, alpha, allow_arbitrary_counts=False, validate_args=True, allow_nan_stats=False, name='DirichletMultinomial')` {#DirichletMultinomial.__init__} +#### `tf.contrib.distributions.DirichletMultinomial.__init__(n, alpha, validate_args=True, allow_nan_stats=False, name='DirichletMultinomial')` {#DirichletMultinomial.__init__} Initialize a batch of DirichletMultinomial distributions. ##### Args: -* `n`: Non-negative `float` or `double` tensor with shape - broadcastable to `[N1,..., Nm]` with `m >= 0`. Defines this as a batch - of `N1 x ... x Nm` different Dirichlet multinomial distributions. Its - components should be equal to integral values. 
-* `alpha`: Positive `float` or `double` tensor with shape broadcastable to - `[N1,..., Nm, k]` `m >= 0`. Defines this as a batch of `N1 x ... x Nm` - different `k` class Dirichlet multinomial distributions. -* `allow_arbitrary_counts`: Boolean. This represents whether the pmf/cdf - allows for the `counts` tensor to be non-integral values. - The pmf/cdf are functions that can be evaluated at non-integral values, - but are only a distribution over non-negative integers. If - `validate_args` is `False`, this assertion is turned off. +* `n`: Non-negative floating point tensor, whose dtype is the same as + `alpha`. The shape is broadcastable to `[N1,..., Nm]` with `m >= 0`. + Defines this as a batch of `N1 x ... x Nm` different Dirichlet + multinomial distributions. Its components should be equal to integer + values. +* `alpha`: Positive floating point tensor, whose dtype is the same as + `n` with shape broadcastable to `[N1,..., Nm, k]` `m >= 0`. Defines + this as a batch of `N1 x ... x Nm` different `k` class Dirichlet + multinomial distributions. * `validate_args`: Whether to assert valid values for parameters `alpha` and - `n`, and `x` in `prob` and `log_prob`. If False, correct behavior is + `n`, and `x` in `prob` and `log_prob`. If `False`, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prefix Ops created by this distribution class. @@ -254,12 +251,11 @@ probability includes a combinatorial coefficient. 
##### Args: -* `counts`: Non-negative `float` or `double` tensor whose shape can - be broadcast with `self.alpha`. For fixed leading dimensions, the last +* `counts`: Non-negative tensor with dtype `dtype` and whose shape can be + broadcast with `self.alpha`. For fixed leading dimensions, the last dimension represents counts for the corresponding Dirichlet Multinomial distribution in `self.alpha`. `counts` is only legal if it sums up to - `n` and its components are equal to integral values. The second - condition is relaxed if `allow_arbitrary_counts` is set. + `n` and its components are equal to integer values. * `name`: Name to give this Op, defaults to "log_prob". ##### Returns: @@ -324,12 +320,11 @@ probability includes a combinatorial coefficient. ##### Args: -* `counts`: Non-negative `float`, `double` tensor whose shape can - be broadcast with `self.alpha`. For fixed leading dimensions, the last +* `counts`: Non-negative tensor with dtype `dtype` and whose shape can be + broadcast with `self.alpha`. For fixed leading dimensions, the last dimension represents counts for the corresponding Dirichlet Multinomial distribution in `self.alpha`. `counts` is only legal if it sums up to - `n` and its components are equal to integral values. The second - condition is relaxed if `allow_arbitrary_counts` is set. + `n` and its components are equal to integer values. * `name`: Name to give this Op, defaults to "prob". ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Exponential.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Exponential.md index e785e49b2d8..745800ba7db 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Exponential.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Exponential.md @@ -15,15 +15,15 @@ Construct Exponential distribution with parameter `lam`. 
##### Args: -* `lam`: `float` or `double` tensor, the rate of the distribution(s). +* `lam`: Floating point tensor, the rate of the distribution(s). `lam` must contain only positive values. * `validate_args`: Whether to assert that `lam > 0`, and that `x > 0` in the - methods `prob(x)` and `log_prob(x)`. If `validate_args` is False + methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False` and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prepend to all ops created by this distribution. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Gamma.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Gamma.md index 741d4d8c08d..cc830c5c70d 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Gamma.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Gamma.md @@ -30,19 +30,19 @@ broadcasting (e.g. `alpha + beta` is a valid operation). ##### Args: -* `alpha`: `float` or `double` tensor, the shape params of the +* `alpha`: Floating point tensor, the shape params of the distribution(s). alpha must contain only positive values. -* `beta`: `float` or `double` tensor, the inverse scale params of the +* `beta`: Floating point tensor, the inverse scale params of the distribution(s). beta must contain only positive values. 
* `validate_args`: Whether to assert that `a > 0, b > 0`, and that `x > 0` in - the methods `prob(x)` and `log_prob(x)`. If `validate_args` is False + the methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False` and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prepend to all ops created by this distribution. ##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGamma.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGamma.md index 11b7ce9156c..cf788712cd7 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGamma.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.InverseGamma.md @@ -30,18 +30,18 @@ broadcasting (e.g. `alpha + beta` is a valid operation). ##### Args: -* `alpha`: `float` or `double` tensor, the shape params of the +* `alpha`: Floating point tensor, the shape params of the distribution(s). alpha must contain only positive values. -* `beta`: `float` or `double` tensor, the scale params of the distribution(s). +* `beta`: Floating point tensor, the scale params of the distribution(s). beta must contain only positive values. * `validate_args`: Whether to assert that `a > 0, b > 0`, and that `x > 0` in - the methods `prob(x)` and `log_prob(x)`. 
If `validate_args` is False + the methods `prob(x)` and `log_prob(x)`. If `validate_args` is `False` and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prepend to all ops created by this distribution. ##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Multinomial.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Multinomial.md new file mode 100644 index 00000000000..7ce70d130b5 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.Multinomial.md @@ -0,0 +1,402 @@ +Multinomial distribution. + +This distribution is parameterized by a vector `p` of probability +parameters for `k` classes and `n`, the counts per each class.. + +#### Mathematical details + +The Multinomial is a distribution over k-class count data, meaning +for each k-tuple of non-negative integer `counts = [n_1,...,n_k]`, we have a +probability of these draws being made from the distribution. The distribution +has hyperparameters `p = (p_1,...,p_k)`, and probability mass +function (pmf): + +```pmf(counts) = n! / (n_1!...n_k!) * (p_1)^n_1*(p_2)^n_2*...(p_k)^n_k``` + +where above `n = sum_j n_j`, `n!` is `n` factorial. + +#### Examples + +Create a 3-class distribution, with the 3rd class is most likely to be drawn, +using logits.. 
+ +```python +logits = [-50., -43, 0] +dist = Multinomial(n=4., logits=logits) +``` + +Create a 3-class distribution, with the 3rd class being the most likely to be drawn. + +```python +p = [.2, .3, .5] +dist = Multinomial(n=4., p=p) +``` + +The distribution functions can be evaluated on counts. + +```python +# counts same shape as p. +counts = [1., 0, 3] +dist.prob(counts) # Shape [] + +# p will be broadcast to [[.2, .3, .5], [.2, .3, .5]] to match counts. +counts = [[1., 2, 1], [2, 2, 0]] +dist.prob(counts) # Shape [2] + +# p will be broadcast to shape [5, 7, 3] to match counts. +counts = [[...]] # Shape [5, 7, 3] +dist.prob(counts) # Shape [5, 7] +``` + +Create a 2-batch of 3-class distributions. + +```python +p = [[.1, .2, .7], [.3, .3, .4]] # Shape [2, 3] +dist = Multinomial(n=[4., 5], p=p) + +counts = [[2., 1, 1], [3, 1, 1]] +dist.prob(counts) # Shape [2] +``` +- - - + +#### `tf.contrib.distributions.Multinomial.__init__(n, logits=None, p=None, validate_args=True, allow_nan_stats=False, name='Multinomial')` {#Multinomial.__init__} + +Initialize a batch of Multinomial distributions. + +##### Args: + + +* `n`: Non-negative floating point tensor with shape broadcastable to + `[N1,..., Nm]` with `m >= 0`. Defines this as a batch of + `N1 x ... x Nm` different Multinomial distributions. Its components + should be equal to integer values. +* `logits`: Floating point tensor representing the log-odds of a + positive event with shape broadcastable to `[N1,..., Nm, k], m >= 0`, + and the same dtype as `n`. Defines this as a batch of `N1 x ... x Nm` + different `k` class Multinomial distributions. +* `p`: Positive floating point tensor with shape broadcastable to + `[N1,..., Nm, k]` `m >= 0` and same dtype as `n`. Defines this as + a batch of `N1 x ... x Nm` different `k` class Multinomial + distributions. `p`'s components in the last portion of its shape should + sum up to 1. 
+* `validate_args`: Whether to assert valid values for parameters `n` and `p`, + and `x` in `prob` and `log_prob`. If `False`, correct behavior is not + guaranteed. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. +* `name`: The name to prefix Ops created by this distribution class. + + +* `Examples`: + +```python +# Define 1-batch of 2-class multinomial distribution, +# also known as a Binomial distribution. +dist = Multinomial(n=2., p=[.1, .9]) + +# Define a 2-batch of 3-class distributions. +dist = Multinomial(n=[4., 5], p=[[.1, .3, .6], [.4, .05, .55]]) +``` + + +- - - + +#### `tf.contrib.distributions.Multinomial.allow_nan_stats` {#Multinomial.allow_nan_stats} + +Boolean describing behavior when a stat is undefined for batch member. + + +- - - + +#### `tf.contrib.distributions.Multinomial.batch_shape(name='batch_shape')` {#Multinomial.batch_shape} + +Batch dimensions of this instance as a 1-D int32 `Tensor`. + +The product of the dimensions of the `batch_shape` is the number of +independent distributions of this kind the instance represents. + +##### Args: + + +* `name`: name to give to the op + +##### Returns: + + `Tensor` `batch_shape` + + +- - - + +#### `tf.contrib.distributions.Multinomial.cdf(value, name='cdf')` {#Multinomial.cdf} + +Cumulative distribution function. + + +- - - + +#### `tf.contrib.distributions.Multinomial.dtype` {#Multinomial.dtype} + +dtype of samples from this distribution. + + +- - - + +#### `tf.contrib.distributions.Multinomial.entropy(name='entropy')` {#Multinomial.entropy} + +Entropy of the distribution in nats. + + +- - - + +#### `tf.contrib.distributions.Multinomial.event_shape(name='event_shape')` {#Multinomial.event_shape} + +Shape of a sample from a single distribution as a 1-D int32 `Tensor`. 
+ +##### Args: + + +* `name`: name to give to the op + +##### Returns: + + `Tensor` `event_shape` + + +- - - + +#### `tf.contrib.distributions.Multinomial.get_batch_shape()` {#Multinomial.get_batch_shape} + +`TensorShape` available at graph construction time. + +Same meaning as `batch_shape`. May be only partially defined. + +##### Returns: + + batch shape + + +- - - + +#### `tf.contrib.distributions.Multinomial.get_event_shape()` {#Multinomial.get_event_shape} + +`TensorShape` available at graph construction time. + +Same meaning as `event_shape`. May be only partially defined. + +##### Returns: + + event shape + + +- - - + +#### `tf.contrib.distributions.Multinomial.is_continuous` {#Multinomial.is_continuous} + + + + +- - - + +#### `tf.contrib.distributions.Multinomial.is_reparameterized` {#Multinomial.is_reparameterized} + + + + +- - - + +#### `tf.contrib.distributions.Multinomial.log_cdf(value, name='log_cdf')` {#Multinomial.log_cdf} + +Log CDF. + + +- - - + +#### `tf.contrib.distributions.Multinomial.log_pdf(value, name='log_pdf')` {#Multinomial.log_pdf} + +Log of the probability density function. + + +- - - + +#### `tf.contrib.distributions.Multinomial.log_pmf(value, name='log_pmf')` {#Multinomial.log_pmf} + +Log of the probability mass function. + + +- - - + +#### `tf.contrib.distributions.Multinomial.log_prob(counts, name='log_prob')` {#Multinomial.log_prob} + +`Log(P[counts])`, computed for every batch member. + +For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability +that after sampling `n` draws from this Multinomial distribution, the +number of draws falling in class `j` is `n_j`. Note that different +sequences of draws can result in the same counts, thus the probability +includes a combinatorial coefficient. + +##### Args: + + +* `counts`: Non-negative tensor with dtype `dtype` and whose shape can + be broadcast with `self.p` and `self.n`. 
For fixed leading dimensions, + the last dimension represents counts for the corresponding Multinomial + distribution in `self.p`. `counts` is only legal if it sums up to `n` + and its components are equal to integer values. +* `name`: Name to give this Op, defaults to "log_prob". + +##### Returns: + + Log probabilities for each record, shape `[N1,...,Nm]`. + + +- - - + +#### `tf.contrib.distributions.Multinomial.logits` {#Multinomial.logits} + +Log-odds. + + +- - - + +#### `tf.contrib.distributions.Multinomial.mean(name='mean')` {#Multinomial.mean} + +Mean of the distribution. + + +- - - + +#### `tf.contrib.distributions.Multinomial.mode(name='mode')` {#Multinomial.mode} + +Mode of the distribution. + + +- - - + +#### `tf.contrib.distributions.Multinomial.n` {#Multinomial.n} + +Number of trials. + + +- - - + +#### `tf.contrib.distributions.Multinomial.name` {#Multinomial.name} + +Name to prepend to all ops. + + +- - - + +#### `tf.contrib.distributions.Multinomial.p` {#Multinomial.p} + +Event probabilities. + + +- - - + +#### `tf.contrib.distributions.Multinomial.pdf(value, name='pdf')` {#Multinomial.pdf} + +The probability density function. + + +- - - + +#### `tf.contrib.distributions.Multinomial.pmf(value, name='pmf')` {#Multinomial.pmf} + +The probability mass function. + + +- - - + +#### `tf.contrib.distributions.Multinomial.prob(counts, name='prob')` {#Multinomial.prob} + +`P[counts]`, computed for every batch member. + +For each batch of counts `[n_1,...,n_k]`, `P[counts]` is the probability +that after sampling `n` draws from this Multinomial distribution, the +number of draws falling in class `j` is `n_j`. Note that different +sequences of draws can result in the same counts, thus the probability +includes a combinatorial coefficient. + +##### Args: + + +* `counts`: Non-negative tensor with dtype `dtype` and whose shape can + be broadcast with `self.p` and `self.n`. 
For fixed leading dimensions, + the last dimension represents counts for the corresponding Multinomial + distribution in `self.p`. `counts` is only legal if it sums up to `n` + and its components are equal to integer values. +* `name`: Name to give this Op, defaults to "prob". + +##### Returns: + + Probabilities for each record, shape `[N1,...,Nm]`. + + +- - - + +#### `tf.contrib.distributions.Multinomial.sample(sample_shape=(), seed=None, name='sample')` {#Multinomial.sample} + +Generate samples of the specified shape for each batched distribution. + +Note that a call to `sample()` without arguments will generate a single +sample per batched distribution. + +##### Args: + + +* `sample_shape`: `int32` `Tensor` or tuple or list. Shape of the generated + samples. +* `seed`: Python integer seed for RNG +* `name`: name to give to the op. + +##### Returns: + + +* `samples`: a `Tensor` of dtype `self.dtype` and shape + `sample_shape + self.batch_shape + self.event_shape`. + + +- - - + +#### `tf.contrib.distributions.Multinomial.sample_n(n, seed=None, name='sample_n')` {#Multinomial.sample_n} + +Generate `n` samples. + +##### Args: + + +* `n`: scalar. Number of samples to draw from each distribution. +* `seed`: Python integer seed for RNG +* `name`: name to give to the op. + +##### Returns: + + +* `samples`: a `Tensor` of shape `(n,) + self.batch_shape + self.event_shape` + with values of type `self.dtype`. + + +- - - + +#### `tf.contrib.distributions.Multinomial.std(name='std')` {#Multinomial.std} + +Standard deviation of the distribution. + + +- - - + +#### `tf.contrib.distributions.Multinomial.validate_args` {#Multinomial.validate_args} + +Boolean describing behavior on invalid input. + + +- - - + +#### `tf.contrib.distributions.Multinomial.variance(name='variance')` {#Multinomial.variance} + +Variance of the distribution. 
+ + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.md new file mode 100644 index 00000000000..4c6b99b4c3d --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.md @@ -0,0 +1,387 @@ +The multivariate normal distribution on `R^k`. + +Every batch member of this distribution is defined by a mean and a lightweight +covariance matrix `C`. + +#### Mathematical details + +The PDF of this distribution in terms of the mean `mu` and covariance `C` is: + +``` +f(x) = (2 pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 (x - mu)^T C^{-1} (x - mu)) +``` + +For every batch member, this distribution represents `k` random variables +`(X_1,...,X_k)`, with mean `E[X_i] = mu[i]`, and covariance matrix +`C_{ij} := E[(X_i - mu[i])(X_j - mu[j])]` + +The user initializes this class by providing the mean `mu`, and a lightweight +definition of `C`: + +``` +C = SS^T = SS = (M + V D V^T) (M + V D V^T) +M is diagonal (k x k) +V = is shape (k x r), typically r << k +D = is diagonal (r x r), optional (defaults to identity). +``` + +This allows for `O(kr + r^3)` pdf evaluation and determinant, and `O(kr)` +sampling and storage (per batch member). + +#### Examples + +A single multi-variate Gaussian distribution is defined by a vector of means +of length `k`, and square root of the covariance `S = M + V D V^T`. Extra +leading dimensions, if provided, allow for batches. + +```python +# Initialize a single 3-variate Gaussian with covariance square root +# S = M + V D V^T, where V D V^T is a matrix-rank 2 update. +mu = [1, 2, 3.] +diag_large = [1.1, 2.2, 3.3] +v = ... # shape 3 x 2 +diag_small = [4., 5.] 
+dist = tf.contrib.distributions.MultivariateNormalDiagPlusVDVT( + mu, diag_large, v, diag_small=diag_small) + +# Evaluate this on an observation in R^3, returning a scalar. +dist.pdf([-1, 0, 1]) + +# Initialize a batch of two 3-variate Gaussians. This time, don't provide +# diag_small. This means S = M + V V^T. +mu = [[1, 2, 3], [11, 22, 33]] # shape 2 x 3 +diag_large = ... # shape 2 x 3 +v = ... # shape 2 x 3 x 1, a matrix-rank 1 update. +dist = tf.contrib.distributions.MultivariateNormalDiagPlusVDVT( + mu, diag_large, v) + +# Evaluate this on a two observations, each in R^3, returning a length two +# tensor. +x = [[-1, 0, 1], [-11, 0, 11]] # Shape 2 x 3. +dist.pdf(x) +``` +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.__init__(mu, diag_large, v, diag_small=None, validate_args=True, allow_nan_stats=False, name='MultivariateNormalDiagPlusVDVT')` {#MultivariateNormalDiagPlusVDVT.__init__} + +Multivariate Normal distributions on `R^k`. + +For every batch member, this distribution represents `k` random variables +`(X_1,...,X_k)`, with mean `E[X_i] = mu[i]`, and covariance matrix +`C_{ij} := E[(X_i - mu[i])(X_j - mu[j])]` + +The user initializes this class by providing the mean `mu`, and a +lightweight definition of `C`: + +``` +C = SS^T = SS = (M + V D V^T) (M + V D V^T) +M is diagonal (k x k) +V = is shape (k x r), typically r << k +D = is diagonal (r x r), optional (defaults to identity). +``` + +##### Args: + + +* `mu`: Rank `n + 1` floating point tensor with shape `[N1,...,Nn, k]`, + `n >= 0`. The means. +* `diag_large`: Optional rank `n + 1` floating point tensor, shape + `[N1,...,Nn, k]` `n >= 0`. Defines the diagonal matrix `M`. +* `v`: Rank `n + 1` floating point tensor, shape `[N1,...,Nn, k, r]` + `n >= 0`. Defines the matrix `V`. +* `diag_small`: Rank `n + 1` floating point tensor, shape + `[N1,...,Nn, k]` `n >= 0`. Defines the diagonal matrix `D`. Default + is `None`, which means `D` will be the identity matrix. 
+* `validate_args`: Whether to validate input with asserts. If `validate_args` + is `False`, + and the inputs are invalid, correct behavior is not guaranteed. +* `allow_nan_stats`: `Boolean`, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. +* `name`: The name to give Ops created by the initializer. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.allow_nan_stats` {#MultivariateNormalDiagPlusVDVT.allow_nan_stats} + +`Boolean` describing behavior when stats are undefined. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.batch_shape(name='batch_shape')` {#MultivariateNormalDiagPlusVDVT.batch_shape} + +Batch dimensions of this instance as a 1-D int32 `Tensor`. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.cdf(value, name='cdf')` {#MultivariateNormalDiagPlusVDVT.cdf} + +Cumulative distribution function. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.dtype` {#MultivariateNormalDiagPlusVDVT.dtype} + + + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.entropy(name='entropy')` {#MultivariateNormalDiagPlusVDVT.entropy} + +The entropies of these Multivariate Normals. + +##### Args: + + +* `name`: The name to give this op. + +##### Returns: + + +* `entropy`: tensor of dtype `dtype`, the entropies. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.event_shape(name='event_shape')` {#MultivariateNormalDiagPlusVDVT.event_shape} + +Shape of a sample from a single distribution as a 1-D int32 `Tensor`. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.get_batch_shape()` {#MultivariateNormalDiagPlusVDVT.get_batch_shape} + +`TensorShape` available at graph construction time. 
+ + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.get_event_shape()` {#MultivariateNormalDiagPlusVDVT.get_event_shape} + +`TensorShape` available at graph construction time. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.is_continuous` {#MultivariateNormalDiagPlusVDVT.is_continuous} + + + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.is_reparameterized` {#MultivariateNormalDiagPlusVDVT.is_reparameterized} + + + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.log_cdf(value, name='log_cdf')` {#MultivariateNormalDiagPlusVDVT.log_cdf} + +Log CDF. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.log_pdf(value, name='log_pdf')` {#MultivariateNormalDiagPlusVDVT.log_pdf} + +Log of the probability density function. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.log_pmf(value, name='log_pmf')` {#MultivariateNormalDiagPlusVDVT.log_pmf} + +Log of the probability mass function. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.log_prob(x, name='log_prob')` {#MultivariateNormalDiagPlusVDVT.log_prob} + +Log prob of observations `x` given these Multivariate Normals. + +`x` is a batch vector with compatible shape if `x` is a `Tensor` whose +shape can be broadcast up to either: + +```` +self.batch_shape + self.event_shape +OR +[M1,...,Mm] + self.batch_shape + self.event_shape +``` + +##### Args: + + +* `x`: Compatible batch vector with same `dtype` as this distribution. +* `name`: The name to give this op. + +##### Returns: + + +* `log_prob`: tensor of dtype `dtype`, the log-PDFs of `x`. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.log_sigma_det(name='log_sigma_det')` {#MultivariateNormalDiagPlusVDVT.log_sigma_det} + +Log of determinant of covariance matrix. 
+ + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.mean(name='mean')` {#MultivariateNormalDiagPlusVDVT.mean} + +Mean of each batch member. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.mode(name='mode')` {#MultivariateNormalDiagPlusVDVT.mode} + +Mode of each batch member. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.mu` {#MultivariateNormalDiagPlusVDVT.mu} + + + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.name` {#MultivariateNormalDiagPlusVDVT.name} + + + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.pdf(value, name='pdf')` {#MultivariateNormalDiagPlusVDVT.pdf} + +The probability density function. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.pmf(value, name='pmf')` {#MultivariateNormalDiagPlusVDVT.pmf} + +The probability mass function. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.prob(x, name='prob')` {#MultivariateNormalDiagPlusVDVT.prob} + +The PDF of observations `x` under these Multivariate Normals. + +`x` is a batch vector with compatible shape if `x` is a `Tensor` whose +shape can be broadcast up to either: + +```` +self.batch_shape + self.event_shape +OR +[M1,...,Mm] + self.batch_shape + self.event_shape +``` + +##### Args: + + +* `x`: Compatible batch vector with same `dtype` as this distribution. +* `name`: The name to give this op. + +##### Returns: + + +* `prob`: tensor of dtype `dtype`, the prob values of `x`. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.sample(sample_shape=(), seed=None, name='sample')` {#MultivariateNormalDiagPlusVDVT.sample} + +Generate samples of the specified shape for each batched distribution. + +Note that a call to `sample()` without arguments will generate a single +sample per batched distribution. + +##### Args: + + +* `sample_shape`: `int32` `Tensor` or tuple or list. 
Shape of the generated + samples. +* `seed`: Python integer seed for RNG +* `name`: name to give to the op. + +##### Returns: + + +* `samples`: a `Tensor` of dtype `self.dtype` and shape + `sample_shape + self.batch_shape + self.event_shape`. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.sample_n(n, seed=None, name='sample_n')` {#MultivariateNormalDiagPlusVDVT.sample_n} + +Sample `n` observations from the Multivariate Normal Distributions. + +##### Args: + + +* `n`: `Scalar`, type int32, the number of observations to sample. +* `seed`: Python integer, the random seed. +* `name`: The name to give this op. + +##### Returns: + + +* `samples`: `[n, ...]`, a `Tensor` of `n` samples for each + of the distributions determined by broadcasting the hyperparameters. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.sigma` {#MultivariateNormalDiagPlusVDVT.sigma} + +Dense (batch) covariance matrix, if available. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.sigma_det(name='sigma_det')` {#MultivariateNormalDiagPlusVDVT.sigma_det} + +Determinant of covariance matrix. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.std(name='std')` {#MultivariateNormalDiagPlusVDVT.std} + +Standard deviation of the distribution. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.validate_args` {#MultivariateNormalDiagPlusVDVT.validate_args} + +`Boolean` describing behavior on invalid input. + + +- - - + +#### `tf.contrib.distributions.MultivariateNormalDiagPlusVDVT.variance(name='variance')` {#MultivariateNormalDiagPlusVDVT.variance} + +Variance of each batch member. 
+ + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.detach_outputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.detach_outputs.md new file mode 100644 index 00000000000..7ef04022163 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.detach_outputs.md @@ -0,0 +1,23 @@ +### `tf.contrib.graph_editor.detach_outputs(sgv, control_outputs=None)` {#detach_outputs} + +Detach the outputs of a subgraph view. + +##### Args: + + +* `sgv`: the subgraph view to be detached. This argument is converted to a + subgraph using the same rules as the function subgraph.make_view. +* `control_outputs`: a util.ControlOutputs instance or None. If not None the + control outputs are also detached. + +##### Returns: + + A new subgraph view of the detached subgraph. + Note that sgv is also modified in place. + +##### Raises: + + +* `StandardError`: if sgv cannot be converted to a SubGraphView using + the same rules as the function subgraph.make_view. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.matcher.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.matcher.md new file mode 100644 index 00000000000..242efb37e3f --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.matcher.md @@ -0,0 +1,29 @@ +Graph match class. +- - - + +#### `tf.contrib.graph_editor.matcher.__init__(positive_filter)` {#matcher.__init__} + +Graph match constructor. + + +- - - + +#### `tf.contrib.graph_editor.matcher.control_input_ops(*args)` {#matcher.control_input_ops} + +Add control input matches. + + +- - - + +#### `tf.contrib.graph_editor.matcher.input_ops(*args)` {#matcher.input_ops} + +Add input matches. + + +- - - + +#### `tf.contrib.graph_editor.matcher.output_ops(*args)` {#matcher.output_ops} + +Add output matches. 
+ + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.reroute_b2a.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.reroute_b2a.md new file mode 100644 index 00000000000..f15af87d5eb --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.reroute_b2a.md @@ -0,0 +1,4 @@ +### `tf.contrib.graph_editor.reroute_b2a(sgv0, sgv1)` {#reroute_b2a} + +Re-route the inputs and outputs of sgv1 to sgv0 (see _reroute). + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.select_ts.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.select_ts.md new file mode 100644 index 00000000000..22905da75da --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.select_ts.md @@ -0,0 +1,30 @@ +### `tf.contrib.graph_editor.select_ts(*args, **kwargs)` {#select_ts} + +Helper to select tensors. + +##### Args: + + +* `*args`: list of 1) regular expressions (compiled or not) or 2) (array of) + tf.Tensor. tf.Operation instances are silently ignored. +* `**kwargs`: 'graph': tf.Graph in which to perform the regex query. This is + required when using regex. + 'positive_filter': an elem is selected only if positive_filter(elem) is + True. This is optional. + 'restrict_ts_regex': a regular expression is ignored if it doesn't start + with the substring "(?#ts)". + +##### Returns: + + list of tf.Tensor + +##### Raises: + + +* `TypeError`: if the optional keyword argument graph is not a tf.Graph + or if an argument in args is not an (array of) tf.Tensor + or an (array of) tf.Operation (silently ignored) or a string + or a regular expression. +* `ValueError`: if one of the keyword arguments is unexpected or if a regular + expression is used without passing a graph as a keyword argument. 
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.swap_outputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.swap_outputs.md new file mode 100644 index 00000000000..31ed5df8d41 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.graph_editor.swap_outputs.md @@ -0,0 +1,4 @@ +### `tf.contrib.graph_editor.swap_outputs(sgv0, sgv1)` {#swap_outputs} + +Swap all the outputs of sgv0 and sgv1 (see _reroute_outputs). + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md index 9822437283f..d292d350493 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.Estimator.md @@ -32,9 +32,9 @@ Constructs an Estimator instance. to configure Estimators from hyper parameter tunning. -* `model_dir`: Directory to save model parameters, graph and etc. This can also - be used to load checkpoints from the directory into a estimator to continue - training a previously saved model. +* `model_dir`: Directory to save model parameters, graph and etc. This can + also be used to load checkpoints from the directory into a estimator to + continue training a previously saved model. * `config`: Configuration object. * `params`: `dict` of hyper parameters that will be passed into `model_fn`. Keys are names of parameters, values are basic python types. @@ -49,56 +49,7 @@ Constructs an Estimator instance. #### `tf.contrib.learn.Estimator.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#Estimator.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. 
If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. 
See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: @@ -112,37 +63,7 @@ for which this evaluation was performed. #### `tf.contrib.learn.Estimator.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#Estimator.fit} -Trains a model given training data `x` predictions and `y` targets. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `steps`: Number of steps for which to train model. If `None`, train forever. - If set, `max_steps` must be `None`. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. -* `max_steps`: Number of total steps for which to train model. If `None`, - train forever. If set, `steps` must be `None`. - - Two calls to `fit(steps=100)` means 200 training - iterations. On the other hand, two calls to `fit(max_steps=100)` means - that the second call will not do any iteration since first call did - all 100 steps. - -##### Returns: - - `self`, for chaining. +See `Trainable`. 
##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.monitors.SummarySaver.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.monitors.SummarySaver.md index 9d42226216e..cc016557ca4 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.monitors.SummarySaver.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.learn.monitors.SummarySaver.md @@ -1,7 +1,7 @@ Saves summaries every N steps. - - - -#### `tf.contrib.learn.monitors.SummarySaver.__init__(summary_op, save_steps=100, output_dir=None, summary_writer=None)` {#SummarySaver.__init__} +#### `tf.contrib.learn.monitors.SummarySaver.__init__(summary_op, save_steps=100, output_dir=None, summary_writer=None, scaffold=None)` {#SummarySaver.__init__} Initializes a `SummarySaver` monitor. @@ -16,6 +16,7 @@ Initializes a `SummarySaver` monitor. if no `summary_writer` is supplied. * `summary_writer`: `SummaryWriter`. If `None` and an `output_dir` was passed, one will be created accordingly. +* `scaffold`: `Scaffold` to get summary_op if it's not provided. - - - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.rnn.CoupledInputForgetGateLSTMCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.rnn.CoupledInputForgetGateLSTMCell.md new file mode 100644 index 00000000000..0e36b224bc6 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.contrib.rnn.CoupledInputForgetGateLSTMCell.md @@ -0,0 +1,93 @@ +Long short-term memory unit (LSTM) recurrent network cell. + +The default non-peephole implementation is based on: + + http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf + +S. Hochreiter and J. Schmidhuber. +"Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997. 
+ +The peephole implementation is based on: + + https://research.google.com/pubs/archive/43905.pdf + +Hasim Sak, Andrew Senior, and Francoise Beaufays. +"Long short-term memory recurrent neural network architectures for + large scale acoustic modeling." INTERSPEECH, 2014. + +The coupling of input and forget gate is based on: + + http://arxiv.org/pdf/1503.04069.pdf + +Greff et al. "LSTM: A Search Space Odyssey" + +The class uses optional peep-hole connections, and an optional projection +layer. +- - - + +#### `tf.contrib.rnn.CoupledInputForgetGateLSTMCell.__init__(num_units, use_peepholes=False, initializer=None, num_proj=None, proj_clip=None, num_unit_shards=1, num_proj_shards=1, forget_bias=1.0, state_is_tuple=False, activation=tanh)` {#CoupledInputForgetGateLSTMCell.__init__} + +Initialize the parameters for an LSTM cell. + +##### Args: + + +* `num_units`: int, The number of units in the LSTM cell +* `use_peepholes`: bool, set True to enable diagonal/peephole connections. +* `initializer`: (optional) The initializer to use for the weight and + projection matrices. +* `num_proj`: (optional) int, The output dimensionality for the projection + matrices. If None, no projection is performed. +* `proj_clip`: (optional) A float value. If `num_proj > 0` and `proj_clip` is + provided, then the projected values are clipped elementwise to within + `[-proj_clip, proj_clip]`. + +* `num_unit_shards`: How to split the weight matrix. If >1, the weight + matrix is stored across num_unit_shards. +* `num_proj_shards`: How to split the projection matrix. If >1, the + projection matrix is stored across num_proj_shards. +* `forget_bias`: Biases of the forget gate are initialized by default to 1 + in order to reduce the scale of forgetting at the beginning of + the training. +* `state_is_tuple`: If True, accepted and returned states are 2-tuples of + the `c_state` and `m_state`. By default (False), they are concatenated + along the column axis. 
This default behavior will soon be deprecated. +* `activation`: Activation function of the inner states. + + +- - - + +#### `tf.contrib.rnn.CoupledInputForgetGateLSTMCell.output_size` {#CoupledInputForgetGateLSTMCell.output_size} + + + + +- - - + +#### `tf.contrib.rnn.CoupledInputForgetGateLSTMCell.state_size` {#CoupledInputForgetGateLSTMCell.state_size} + + + + +- - - + +#### `tf.contrib.rnn.CoupledInputForgetGateLSTMCell.zero_state(batch_size, dtype)` {#CoupledInputForgetGateLSTMCell.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.svd.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.svd.md new file mode 100644 index 00000000000..0c9f0aacf0d --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard3/tf.svd.md @@ -0,0 +1,39 @@ +### `tf.svd(matrix, compute_uv=True, full_matrices=False, name=None)` {#svd} + +Computes the singular value decomposition of a matrix. + +Computes the SVD of `matrix` such that `matrix = u * diag(s) * +transpose(v)` + +```prettyprint +# a is a matrix. +# s is a vector of singular values. +# u is the matrix of left singular vectors. +# v is a matrix of right singular vectors. +s, u, v = svd(a) +s = svd(a, compute_uv=False) +``` + +##### Args: + + +* `matrix`: `Tensor` of shape `[M, N]`. Let `P` be the minimum of `M` and `N`. 
+* `compute_uv`: If `True` then left and right singular vectors will be + computed and returned in `u` and `v`, respectively. Otherwise, only the + singular values will be computed, which can be significantly faster. +* `full_matrices`: If true, compute full-sized `u` and `v`. If false + (the default), compute only the leading `P` singular vectors. + Ignored if `compute_uv` is `False`. +* `name`: string, optional name of the operation. + +##### Returns: + + +* `s`: Singular values. Shape is `[P]`. +* `u`: Left singular vectors. If `full_matrices` is `False` (default) then + shape is `[M, P]`; if `full_matrices` is `True` then shape is + `[M, M]`. Not returned if `compute_uv` is `False`. +* `v`: Right singular vectors. If `full_matrices` is `False` (default) then + shape is `[N, P]`. If `full_matrices` is `True` then shape is + `[N, N]`. Not returned if `compute_uv` is `False`. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.assert_less.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.assert_less.md index eb43a624441..e05c81c2950 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.assert_less.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.assert_less.md @@ -1,4 +1,4 @@ -### `tf.assert_less(x, y, data=None, summarize=None, name=None)` {#assert_less} +### `tf.assert_less(x, y, data=None, summarize=None, message=None, name=None)` {#assert_less} Assert the condition `x < y` holds element-wise. @@ -27,6 +27,7 @@ If both `x` and `y` are empty, this is trivially satisfied. * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`, `y`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_less". 
##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.batch_svd.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.batch_svd.md new file mode 100644 index 00000000000..2555bb57e30 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.batch_svd.md @@ -0,0 +1,41 @@ +### `tf.batch_svd(tensor, compute_uv=True, full_matrices=False, name=None)` {#batch_svd} + +Computes the singular value decompositions of a batch of matrices. + +Computes the SVD of each inner matrix in `tensor` such that +`tensor[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, +:])` + +```prettyprint
# a is a tensor.
# s is a tensor of singular values.
# u is a tensor of left singular vectors.
# v is a tensor of right singular vectors.
s, u, v = batch_svd(a)
s = batch_svd(a, compute_uv=False)
``` + +##### Args: + + +* `tensor`: `Tensor` of shape `[..., M, N]`. Let `P` be the minimum of `M` and + `N`. +* `compute_uv`: If `True` then left and right singular vectors will be + computed and returned in `u` and `v`, respectively. Otherwise, only the + singular values will be computed, which can be significantly faster. +* `full_matrices`: If true, compute full-sized `u` and `v`. If false + (the default), compute only the leading `P` singular vectors. + Ignored if `compute_uv` is `False`. +* `name`: string, optional name of the operation. + +##### Returns: + + +* `s`: Singular values. Shape is `[..., P]`. +* `u`: Left singular vectors. If `full_matrices` is `False` (default) then + shape is `[..., M, P]`; if `full_matrices` is `True` then shape is + `[..., M, M]`. Not returned if `compute_uv` is `False`. +* `v`: Right singular vectors. If `full_matrices` is `False` (default) then + shape is `[..., N, P]`. If `full_matrices` is `True` then shape is + `[..., N, N]`. Not returned if `compute_uv` is `False`. 
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.graph_editor.sgv_scope.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.graph_editor.sgv_scope.md new file mode 100644 index 00000000000..6362e0d99f0 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.graph_editor.sgv_scope.md @@ -0,0 +1,14 @@ +### `tf.contrib.graph_editor.sgv_scope(scope, graph)` {#sgv_scope} + +Make a subgraph from a name scope. + +##### Args: + + +* `scope`: the name of the scope. +* `graph`: the tf.Graph. + +##### Returns: + + A subgraph view representing the given scope. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.layers.avg_pool2d.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.layers.avg_pool2d.md index b10aeaef094..a2ef9010ea5 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.layers.avg_pool2d.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.layers.avg_pool2d.md @@ -1,25 +1,24 @@ ### `tf.contrib.layers.avg_pool2d(*args, **kwargs)` {#avg_pool2d} -Adds a Avg Pooling op. +Adds a 2D average pooling op. -It is assumed by the wrapper that the pooling is only done per image and not -in depth or batch. +It is assumed that the pooling is done per image but not in batch or channels. ##### Args: -* `inputs`: a tensor of size [batch_size, height, width, depth]. -* `kernel_size`: a list of length 2: [kernel_height, kernel_width] of the +* `inputs`: A `Tensor` of size [batch_size, height, width, channels]. +* `kernel_size`: A list of length 2: [kernel_height, kernel_width] of the pooling kernel over which the op is computed. Can be an int if both values are the same. -* `stride`: a list of length 2: [stride_height, stride_width]. - Can be an int if both strides are the same. Note that presently +* `stride`: A list of length 2: [stride_height, stride_width]. 
+ Can be an int if both strides are the same. Note that presently both strides must have the same value. -* `padding`: the padding method, either 'VALID' or 'SAME'. -* `outputs_collections`: collection to add the outputs. +* `padding`: The padding method, either 'VALID' or 'SAME'. +* `outputs_collections`: The collections to which the outputs are added. * `scope`: Optional scope for op_scope. ##### Returns: - a tensor representing the results of the pooling operation. + A `Tensor` representing the results of the pooling operation. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md index 56b5edb17c0..767756e311f 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.DNNClassifier.md @@ -120,56 +120,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.DNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#DNNClassifier.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. 
The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: @@ -183,37 +134,7 @@ for which this evaluation was performed. #### `tf.contrib.learn.DNNClassifier.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#DNNClassifier.fit} -Trains a model given training data `x` predictions and `y` targets. 
- -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `steps`: Number of steps for which to train model. If `None`, train forever. - If set, `max_steps` must be `None`. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. -* `max_steps`: Number of total steps for which to train model. If `None`, - train forever. If set, `steps` must be `None`. - - Two calls to `fit(steps=100)` means 200 training - iterations. On the other hand, two calls to `fit(max_steps=100)` means - that the second call will not do any iteration since first call did - all 100 steps. - -##### Returns: - - `self`, for chaining. +See `Trainable`. ##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.TensorFlowLinearClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.TensorFlowLinearClassifier.md index c5f0eab6b7d..2c7b221fb22 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.TensorFlowLinearClassifier.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.learn.TensorFlowLinearClassifier.md @@ -31,56 +31,7 @@ Returns weights of deep neural network part. 
#### `tf.contrib.learn.TensorFlowLinearClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowLinearClassifier.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. 
- `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.rnn.TimeFreqLSTMCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.rnn.TimeFreqLSTMCell.md new file mode 100644 index 00000000000..e870477b6ba --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.contrib.rnn.TimeFreqLSTMCell.md @@ -0,0 +1,70 @@ +Time-Frequency Long short-term memory unit (LSTM) recurrent network cell. + +This implementation is based on: + + Tara N. Sainath and Bo Li + "Modeling Time-Frequency Patterns with LSTM vs. Convolutional Architectures + for LVCSR Tasks." submitted to INTERSPEECH, 2016. + +It uses peep-hole connections and optional cell clipping. +- - - + +#### `tf.contrib.rnn.TimeFreqLSTMCell.__init__(num_units, use_peepholes=False, cell_clip=None, initializer=None, num_unit_shards=1, forget_bias=1.0, feature_size=None, frequency_skip=None)` {#TimeFreqLSTMCell.__init__} + +Initialize the parameters for an LSTM cell. + +##### Args: + + +* `num_units`: int, The number of units in the LSTM cell +* `use_peepholes`: bool, set True to enable diagonal/peephole connections. +* `cell_clip`: (optional) A float value, if provided the cell state is clipped + by this value prior to the cell output activation. 
+* `initializer`: (optional) The initializer to use for the weight and + projection matrices. +* `num_unit_shards`: int, How to split the weight matrix. If >1, the weight + matrix is stored across num_unit_shards. +* `forget_bias`: float, Biases of the forget gate are initialized by default + to 1 in order to reduce the scale of forgetting at the beginning + of the training. +* `feature_size`: int, The size of the input feature the LSTM spans over. +* `frequency_skip`: int, The amount the LSTM filter is shifted by in + frequency. + + +- - - + +#### `tf.contrib.rnn.TimeFreqLSTMCell.output_size` {#TimeFreqLSTMCell.output_size} + + + + +- - - + +#### `tf.contrib.rnn.TimeFreqLSTMCell.state_size` {#TimeFreqLSTMCell.state_size} + + + + +- - - + +#### `tf.contrib.rnn.TimeFreqLSTMCell.zero_state(batch_size, dtype)` {#TimeFreqLSTMCell.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. + +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.greater.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.greater.md index c629a0286f3..99b34aaca47 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.greater.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.greater.md @@ -2,6 +2,9 @@ Returns the truth value of (x > y) element-wise. +*NOTE*: `Greater` supports broadcasting. 
More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.depthwise_conv2d.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.depthwise_conv2d.md index 5fee10cf401..067d223b1d4 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.depthwise_conv2d.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.depthwise_conv2d.md @@ -28,7 +28,8 @@ same horizontal and vertical strides, `strides = [1, stride, stride, 1]`. * `strides`: 1-D of size 4. The stride of the sliding window for each dimension of `input`. * `padding`: A string, either `'VALID'` or `'SAME'`. The padding algorithm. - See the [comment here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) + See the [comment + here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) * `name`: A name for this operation (optional). ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.log_poisson_loss.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.log_poisson_loss.md new file mode 100644 index 00000000000..bd7117a2732 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.nn.log_poisson_loss.md @@ -0,0 +1,44 @@ +### `tf.nn.log_poisson_loss(log_input, targets, compute_full_loss=False, name=None)` {#log_poisson_loss} + +Computes log poisson loss given `log_input`. + +Gives the log-likelihood loss between the prediction and the target under the +assumption that the target has a poisson distribution. +Caveat: By default, this is not the exact loss, but the loss minus a + constant term [log(z!)]. That has no effect for optimization, but + does not play well with relative loss comparisons. 
To compute an +    approximation of the log factorial term, specify +    compute_full_loss=True to enable Stirling's Approximation. + +For brevity, let `c = log(x) = log_input`, `z = targets`.  The log poisson +loss is + +      -log(exp(-x) * (x^z) / z!) +    = -log(exp(-x) * (x^z)) + log(z!) +    ~ -log(exp(-x)) - log(x^z) [+ z * log(z) - z + 0.5 * log(2 * pi * z)] +        [ Note the second term is the Stirling's Approximation for log(z!). +          It is invariant to x and does not affect optimization, though +          important for correct relative loss comparisons. It is only +          computed when compute_full_loss == True. ] +    = x - z * log(x) [+ z * log(z) - z + 0.5 * log(2 * pi * z)] +    = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)] + +##### Args: + + +*  `log_input`: A `Tensor` of type `float32` or `float64`. +*  `targets`: A `Tensor` of the same type and shape as `log_input`. +*  `compute_full_loss`: whether to compute the full loss. If false, a constant +    term is dropped in favor of more efficient optimization. +*  `name`: A name for the operation (optional). + +##### Returns: + +  A `Tensor` of the same shape as `log_input` with the componentwise +  log-Poisson losses. + +##### Raises: + + +*  `ValueError`: If `log_input` and `targets` do not have the same shape. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.self_adjoint_eigvals.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.self_adjoint_eigvals.md new file mode 100644 index 00000000000..3dc968afa13 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.self_adjoint_eigvals.md @@ -0,0 +1,15 @@ +### `tf.self_adjoint_eigvals(matrix, name=None)` {#self_adjoint_eigvals} + +Computes the eigenvalues of a self-adjoint matrix. + +##### Args: + + +*  `matrix`: `Tensor` of shape `[N, N]`. +*  `name`: string, optional name of the operation. + +##### Returns: + + +*  `e`: Eigenvalues of `matrix`. Shape is `[N]`. 
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.sub.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.sub.md index 2d1da0f0b98..83dbd7a93c8 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.sub.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard4/tf.sub.md @@ -2,6 +2,9 @@ Returns x - y element-wise. +*NOTE*: `Sub` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.add.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.add.md index 738f0337d30..da82da60762 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.add.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.add.md @@ -2,7 +2,8 @@ Returns x + y element-wise. -*NOTE*: Add supports broadcasting. AddN does not. +*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.assert_integer.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.assert_integer.md index c75ba587658..bc18c0a8f9b 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.assert_integer.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.assert_integer.md @@ -1,4 +1,4 @@ -### `tf.assert_integer(x, data=None, summarize=None, name=None)` {#assert_integer} +### `tf.assert_integer(x, message=None, name=None)` {#assert_integer} Assert that `x` is of integer dtype. @@ -19,12 +19,15 @@ x = tf.with_dependencies([tf.assert_integer(x)], x) * `x`: `Tensor` whose basetype is integer and is not quantized. -* `data`: The tensors to print out if the condition is False. 
Defaults to -    error message and first few entries of `x`. -*  `summarize`: Print this many entries of each tensor. +*  `message`: A string to prefix to the default message. *  `name`: A name for this operation (optional).  Defaults to "assert_integer". +##### Raises: + + +*  `TypeError`: If `x.dtype` is anything other than non-quantized integer. + ##### Returns: -  Op that raises `InvalidArgumentError` if `x == y` is False. +  A `no_op` that does nothing.  Type can be determined statically. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.bypass.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.bypass.md new file mode 100644 index 00000000000..976d579cd64 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.bypass.md @@ -0,0 +1,21 @@ +### `tf.contrib.graph_editor.bypass(sgv)` {#bypass} + +Bypass the given subgraph by connecting its inputs to its outputs. + +##### Args: + + +*  `sgv`: the subgraph view to be bypassed. This argument is converted to a +    subgraph using the same rules as the function subgraph.make_view. + +##### Returns: + +  A new subgraph view of the bypassed subgraph. +  Note that sgv is also modified in place. + +##### Raises: + + +*  `StandardError`: if sgv cannot be converted to a SubGraphView using +    the same rules as the function subgraph.make_view. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.reroute_a2b.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.reroute_a2b.md new file mode 100644 index 00000000000..4a4cecc26c2 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.reroute_a2b.md @@ -0,0 +1,4 @@ +### `tf.contrib.graph_editor.reroute_a2b(sgv0, sgv1)` {#reroute_a2b} + +Re-route the inputs and outputs of sgv0 to sgv1 (see _reroute). 
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.reroute_b2a_inputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.reroute_b2a_inputs.md new file mode 100644 index 00000000000..46a82bdad96 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.graph_editor.reroute_b2a_inputs.md @@ -0,0 +1,4 @@ +### `tf.contrib.graph_editor.reroute_b2a_inputs(sgv0, sgv1)` {#reroute_b2a_inputs} + +Re-route all the inputs of sgv1 to sgv0 (see reroute_inputs). + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.TensorFlowDNNClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.TensorFlowDNNClassifier.md index 8fd75ed89cf..ab811506714 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.TensorFlowDNNClassifier.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.TensorFlowDNNClassifier.md @@ -31,56 +31,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.TensorFlowDNNClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowDNNClassifier.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. 
- -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. 
##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.monitors.CheckpointSaver.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.monitors.CheckpointSaver.md index 07e073ced5a..58db246a0ca 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.monitors.CheckpointSaver.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.contrib.learn.monitors.CheckpointSaver.md @@ -1,38 +1,32 @@ Saves checkpoints every N steps. - - - -#### `tf.contrib.learn.monitors.CheckpointSaver.__init__(every_n_steps, saver, checkpoint_dir, checkpoint_basename='model.ckpt', first_n_steps=-1)` {#CheckpointSaver.__init__} +#### `tf.contrib.learn.monitors.CheckpointSaver.__init__(checkpoint_dir, save_secs=None, save_steps=None, saver=None, checkpoint_basename='model.ckpt', scaffold=None)` {#CheckpointSaver.__init__} Initialize CheckpointSaver monitor. ##### Args: -* `every_n_steps`: `int`, save every N steps. -* `saver`: `Saver` object, used for saving. * `checkpoint_dir`: `str`, base directory for the checkpoint files. +* `save_secs`: `int`, save every N secs. +* `save_steps`: `int`, save every N steps. +* `saver`: `Saver` object, used for saving. * `checkpoint_basename`: `str`, base name for the checkpoint files. -* `first_n_steps`: `int`, if positive, save every step during the - first `first_n_steps` steps. +* `scaffold`: `Scaffold`, use to get saver object. + +##### Raises: + + +* `ValueError`: If both `save_steps` and `save_secs` are not `None`. +* `ValueError`: If both `save_steps` and `save_secs` are `None`. - - - #### `tf.contrib.learn.monitors.CheckpointSaver.begin(max_steps=None)` {#CheckpointSaver.begin} -Called at the beginning of training. -When called, the default graph is the one we are executing. - -##### Args: - - -* `max_steps`: `int`, the maximum global step this training will run until. 
- -##### Raises: - - -* `ValueError`: if we've already begun a run. - - - @@ -76,55 +70,6 @@ End epoch. * `ValueError`: if we've not begun an epoch, or `epoch` number does not match. -- - - - -#### `tf.contrib.learn.monitors.CheckpointSaver.every_n_post_step(step, session)` {#CheckpointSaver.every_n_post_step} - - - - -- - - - -#### `tf.contrib.learn.monitors.CheckpointSaver.every_n_step_begin(step)` {#CheckpointSaver.every_n_step_begin} - -Callback before every n'th step begins. - -##### Args: - - -* `step`: `int`, the current value of the global step. - -##### Returns: - - A `list` of tensors that will be evaluated at this step. - - -- - - - -#### `tf.contrib.learn.monitors.CheckpointSaver.every_n_step_end(step, outputs)` {#CheckpointSaver.every_n_step_end} - -Callback after every n'th step finished. - -This callback provides access to the tensors/ops evaluated at this step, -including the additional tensors for which evaluation was requested in -`step_begin`. - -In addition, the callback has the opportunity to stop training by returning -`True`. This is useful for early stopping, for example. - -##### Args: - - -* `step`: `int`, the current value of the global step. -* `outputs`: `dict` mapping `string` values representing tensor names to - the value resulted from running these tensors. Values may be either - scalars, for scalar tensors, or Numpy `array`, for non-scalar tensors. - -##### Returns: - - `bool`. True if training should stop. - - - - - #### `tf.contrib.learn.monitors.CheckpointSaver.post_step(step, session)` {#CheckpointSaver.post_step} @@ -160,33 +105,24 @@ A setter called automatically by the target estimator. #### `tf.contrib.learn.monitors.CheckpointSaver.step_begin(step)` {#CheckpointSaver.step_begin} -Overrides `BaseMonitor.step_begin`. -When overriding this method, you must call the super implementation. - -##### Args: - - -* `step`: `int`, the current value of the global step. 
- -##### Returns: - - A `list`, the result of every_n_step_begin, if that was called this step, - or an empty list otherwise. - -##### Raises: - - -* `ValueError`: if called more than once during a step. - - - #### `tf.contrib.learn.monitors.CheckpointSaver.step_end(step, output)` {#CheckpointSaver.step_end} -Overrides `BaseMonitor.step_end`. +Callback after training step finished. -When overriding this method, you must call the super implementation. +This callback provides access to the tensors/ops evaluated at this step, +including the additional tensors for which evaluation was requested in +`step_begin`. + +In addition, the callback has the opportunity to stop training by returning +`True`. This is useful for early stopping, for example. + +Note that this method is not called if the call to `Session.run()` that +followed the last call to `step_begin()` failed. ##### Args: @@ -198,7 +134,11 @@ When overriding this method, you must call the super implementation. ##### Returns: - `bool`, the result of every_n_step_end, if that was called this step, - or `False` otherwise. + `bool`. True if training should stop. + +##### Raises: + + +* `ValueError`: if we've not begun a step, or `step` number does not match. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.div.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.div.md index 92eba7927a0..61616c0e6b2 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.div.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.div.md @@ -2,6 +2,9 @@ Returns x / y element-wise. +*NOTE*: `Div` supports broadcasting. 
More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.logical_and.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.logical_and.md index dd5b563c8ba..2b5f011ccdc 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.logical_and.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.logical_and.md @@ -2,6 +2,9 @@ Returns the truth value of x AND y element-wise. +*NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.variable_op_scope.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.variable_op_scope.md index 709d2375b50..4ed0f567ffa 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.variable_op_scope.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard5/tf.variable_op_scope.md @@ -1,4 +1,4 @@ -### `tf.variable_op_scope(values, name_or_scope, default_name=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, reuse=None)` {#variable_op_scope} +### `tf.variable_op_scope(values, name_or_scope, default_name=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, reuse=None, dtype=None)` {#variable_op_scope} Returns a context manager for defining an op that creates variables. @@ -42,6 +42,8 @@ def my_op_with_vars(a, b, scope=None): * `custom_getter`: The default custom getter for variables within this scope. * `reuse`: `True` or `None`; if `True`, we go into reuse mode for this scope as well as all sub-scopes; if `None`, we just inherit the parent scope reuse. 
+* `dtype`: The default type of variables created in this scope, defaults to the + type of the parent scope. ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.assert_positive.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.assert_positive.md index 8b727d62150..1be22381d26 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.assert_positive.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.assert_positive.md @@ -1,4 +1,4 @@ -### `tf.assert_positive(x, data=None, summarize=None, name=None)` {#assert_positive} +### `tf.assert_positive(x, data=None, summarize=None, message=None, name=None)` {#assert_positive} Assert the condition `x > 0` holds element-wise. @@ -25,6 +25,7 @@ If `x` is empty this is trivially satisfied. * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_positive". ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Beta.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Beta.md index aa40420ff83..df1b3d32e6e 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Beta.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Beta.md @@ -67,20 +67,20 @@ Initialize a batch of Beta distributions. ##### Args: -* `a`: Positive `float` or `double` tensor with shape broadcastable to +* `a`: Positive floating point tensor with shape broadcastable to `[N1,..., Nm]` `m >= 0`. Defines this as a batch of `N1 x ... x Nm` different Beta distributions. This also defines the dtype of the distribution. 
-* `b`: Positive `float` or `double` tensor with shape broadcastable to +* `b`: Positive floating point tensor with shape broadcastable to `[N1,..., Nm]` `m >= 0`. Defines this as a batch of `N1 x ... x Nm` different Beta distributions. * `validate_args`: Whether to assert valid values for parameters `a` and `b`, - and `x` in `prob` and `log_prob`. If False, correct behavior is not + and `x` in `prob` and `log_prob`. If `False`, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to prefix Ops created by this distribution class. @@ -242,7 +242,7 @@ Log of the probability mass function. ##### Args: -* `x`: Non-negative `float` or `double`, tensor whose shape can +* `x`: Non-negative floating point tensor whose shape can be broadcast with `self.a` and `self.b`. For fixed leading dimensions, the last dimension represents counts for the corresponding Beta distribution in `self.a` and `self.b`. `x` is only legal if @@ -312,7 +312,7 @@ The probability mass function. ##### Args: -* `x`: Non-negative `float`, `double` tensor whose shape can +* `x`: Non-negative floating point tensor whose shape can be broadcast with `self.a` and `self.b`. For fixed leading dimensions, the last dimension represents x for the corresponding Beta distribution in `self.a` and `self.b`. 
`x` is only legal if is diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Laplace.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Laplace.md index 273e23714fe..815e544a063 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Laplace.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.distributions.Laplace.md @@ -20,17 +20,17 @@ broadcasting (e.g., `loc / scale` is a valid operation). ##### Args: -* `loc`: `float` or `double` tensor which characterizes the location (center) +* `loc`: Floating point tensor which characterizes the location (center) of the distribution. -* `scale`: `float` or `double`, positive-valued tensor which characterzes the - spread of the distribution. +* `scale`: Positive floating point tensor which characterizes the spread of + the distribution. * `validate_args`: Whether to validate input with asserts. If `validate_args` is `False`, and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to give Ops created by the initializer. 
##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.framework.deprecated.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.framework.deprecated.md index 15924febeed..2daecf41e27 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.framework.deprecated.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.framework.deprecated.md @@ -2,15 +2,14 @@ Decorator for marking functions or methods deprecated. -This decorator adds a deprecation warning to a function's docstring. It has -the following format: +This decorator logs a deprecation warning whenever the decorated function is +called. It has the following format: (from ) is deprecated and will be removed after . Instructions for updating: -whenever the decorated function is called. will include the class -name if it is a method. + will include the class name if it is a method. It also edits the docstring of the function: ' (deprecated)' is appended to the first line of the docstring and a deprecation notice is prepended diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.graph_editor.ts.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.graph_editor.ts.md new file mode 100644 index 00000000000..9239a5a3dca --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.contrib.graph_editor.ts.md @@ -0,0 +1,30 @@ +### `tf.contrib.graph_editor.ts(*args, **kwargs)` {#ts} + +Helper to select tensors. + +##### Args: + + +* `*args`: list of 1) regular expressions (compiled or not) or 2) (array of) + tf.Tensor. tf.Operation instances are silently ignored. +* `**kwargs`: 'graph': tf.Graph in which to perform the regex query. This is + required when using regex. + 'positive_filter': an elem is selected only if positive_filter(elem) is + True. This is optional. 
+ 'restrict_ts_regex': a regular expression is ignored if it doesn't start + with the substring "(?#ts)". + +##### Returns: + + list of tf.Tensor + +##### Raises: + + +* `TypeError`: if the optional keyword argument graph is not a tf.Graph + or if an argument in args is not an (array of) tf.Tensor + or an (array of) tf.Operation (silently ignored) or a string + or a regular expression. +* `ValueError`: if one of the keyword arguments is unexpected or if a regular + expression is used without passing a graph as a keyword argument. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.cumsum.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.cumsum.md index 64a8312fde0..baa00e57d53 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.cumsum.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.cumsum.md @@ -30,8 +30,8 @@ tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0] * `x`: A `Tensor`. Must be one of the following types: `float32`, `float64`, - `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, - `complex128`, `qint8`, `quint8`, `qint32`, `half`. + `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, + `complex128`, `qint8`, `quint8`, `qint32`, `half`. * `axis`: A `Tensor` of type `int32` (default: 0). * `reverse`: A `bool` (default: False). * `name`: A name for the operation (optional). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.maximum.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.maximum.md index 309946f4352..aec816dcbad 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.maximum.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.maximum.md @@ -1,6 +1,9 @@ ### `tf.maximum(x, y, name=None)` {#maximum} -Returns the max of x and y (i.e. x > y ? x : y) element-wise, broadcasts. +Returns the max of x and y (i.e. x > y ? 
x : y) element-wise. + +*NOTE*: `Maximum` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.nn.separable_conv2d.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.nn.separable_conv2d.md index 23a772c7125..a42cabb8a8f 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.nn.separable_conv2d.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.nn.separable_conv2d.md @@ -32,7 +32,8 @@ horizontal and vertical strides, `strides = [1, stride, stride, 1]`. * `strides`: 1-D of size 4. The strides for the depthwise convolution for each dimension of `input`. * `padding`: A string, either `'VALID'` or `'SAME'`. The padding algorithm. - See the [comment here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) + See the [comment + here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) * `name`: A name for this operation (optional). ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.self_adjoint_eig.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.self_adjoint_eig.md index efbc0cd3be9..48cd3b0575a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.self_adjoint_eig.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.self_adjoint_eig.md @@ -1,21 +1,20 @@ -### `tf.self_adjoint_eig(input, name=None)` {#self_adjoint_eig} +### `tf.self_adjoint_eig(matrix, name=None)` {#self_adjoint_eig} -Calculates the Eigen Decomposition of a square Self-Adjoint matrix. +Computes the eigen decomposition of a self-adjoint matrix. -Only the lower-triangular part of the input will be used in this case. The -upper-triangular part will not be read. 
- -The result is a M+1 x M matrix whose first row is the eigenvalues, and -subsequent rows are eigenvectors. +Computes the eigenvalues and eigenvectors of an N-by-N matrix `matrix` such +that `matrix * v[:,i] = e(i) * v[:,i]`, for i=0...N-1. ##### Args: -* `input`: A `Tensor`. Must be one of the following types: `float64`, `float32`. - Shape is `[M, M]`. -* `name`: A name for the operation (optional). +* `matrix`: `Tensor` of shape `[N, N]`. +* `name`: string, optional name of the operation. ##### Returns: - A `Tensor`. Has the same type as `input`. Shape is `[M+1, M]`. + +* `e`: Eigenvalues. Shape is `[N]`. +* `v`: Eigenvectors. Shape is `[N, N]`. The columns contain the eigenvectors of + `matrix`. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.Coordinator.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.Coordinator.md index 744e4e233a6..27ae6f13e3a 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.Coordinator.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.Coordinator.md @@ -120,11 +120,14 @@ After this is called, calls to `should_stop()` will return `False`. - - - -#### `tf.train.Coordinator.join(threads, stop_grace_period_secs=120)` {#Coordinator.join} +#### `tf.train.Coordinator.join(threads=None, stop_grace_period_secs=120)` {#Coordinator.join} Wait for threads to terminate. -Blocks until all `threads` have terminated or `request_stop()` is called. +This call blocks until a set of threads have terminated. The set of thread +is the union of the threads passed in the `threads` argument and the list +of threads that registered with the coordinator by calling +`Coordinator.register_thread()`. After the threads stop, if an `exc_info` was passed to `request_stop`, that exception is re-raised. @@ -138,7 +141,8 @@ that `RuntimeError`. ##### Args: -* `threads`: List of `threading.Threads`. The started threads to join. 
+* `threads`: List of `threading.Threads`. The started threads to join in + addition to the registered threads. * `stop_grace_period_secs`: Number of seconds given to threads to stop after `request_stop()` has been called. @@ -156,6 +160,18 @@ that `RuntimeError`. +- - - + +#### `tf.train.Coordinator.register_thread(thread)` {#Coordinator.register_thread} + +Register a thread to join. + +##### Args: + + +* `thread`: A Python thread to join. + + - - - #### `tf.train.Coordinator.request_stop(ex=None)` {#Coordinator.request_stop} diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.exponential_decay.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.exponential_decay.md index d90c8ee7269..42d8f100769 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.exponential_decay.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard6/tf.train.exponential_decay.md @@ -28,7 +28,7 @@ learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 100000, 0.96, staircase=True) # Passing global_step to minimize() will increment it at each step. learning_step = ( - tf.GradientDescentOptimizer(learning_rate) + tf.train.GradientDescentOptimizer(learning_rate) .minimize(...my loss..., global_step=global_step) ) ``` diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.assert_equal.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.assert_equal.md index ea4fd3a1fd3..d855dd857ad 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.assert_equal.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.assert_equal.md @@ -1,4 +1,4 @@ -### `tf.assert_equal(x, y, data=None, summarize=None, name=None)` {#assert_equal} +### `tf.assert_equal(x, y, data=None, summarize=None, message=None, name=None)` {#assert_equal} Assert the condition `x == y` holds element-wise. 
@@ -27,6 +27,7 @@ If both `x` and `y` are empty, this is trivially satisfied. * `data`: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`, `y`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_equal". ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.batch_cholesky.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.batch_cholesky.md index 487680f50b8..1ce7fca603d 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.batch_cholesky.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.batch_cholesky.md @@ -1,6 +1,6 @@ ### `tf.batch_cholesky(input, name=None)` {#batch_cholesky} -Calculates the Cholesky decomposition of a batch of square matrices. +Computes the Cholesky decomposition of a batch of square matrices. The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form square matrices, with the same constraints as the single matrix Cholesky diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.MultivariateNormalFull.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.MultivariateNormalFull.md index e056b6a5ad8..3b1715d88c2 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.MultivariateNormalFull.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.MultivariateNormalFull.md @@ -5,14 +5,12 @@ Evaluation of the pdf, determinant, and sampling are all `O(k^3)` operations. 
#### Mathematical details -The PDF of this distribution is: +With `C = sigma`, the PDF of this distribution is: ``` -f(x) = (2*pi)^(-k/2) |det(sigma)|^(-1/2) exp(-1/2*(x-mu)^*.sigma^{-1}.(x-mu)) +f(x) = (2 pi)^(-k/2) |det(C)|^(-1/2) exp(-1/2 (x - mu)^T C^{-1} (x - mu)) ``` -where `.` denotes the inner product on `R^k` and `^*` denotes transpose. - #### Examples A single multi-variate Gaussian distribution is defined by a vector of means @@ -50,17 +48,17 @@ User must provide means `mu` and `sigma`, the mean and covariance. ##### Args: -* `mu`: `(N+1)-D` `float` or `double` tensor with shape `[N1,...,Nb, k]`, +* `mu`: `(N+1)-D` floating point tensor with shape `[N1,...,Nb, k]`, `b >= 0`. * `sigma`: `(N+2)-D` `Tensor` with same `dtype` as `mu` and shape - `[N1,...,Nb, k, k]`. + `[N1,...,Nb, k, k]`. Each batch member must be positive definite. * `validate_args`: Whether to validate input with asserts. If `validate_args` is `False`, and the inputs are invalid, correct behavior is not guaranteed. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. +* `allow_nan_stats`: `Boolean`, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to give Ops created by the initializer. ##### Raises: @@ -73,7 +71,7 @@ User must provide means `mu` and `sigma`, the mean and covariance. #### `tf.contrib.distributions.MultivariateNormalFull.allow_nan_stats` {#MultivariateNormalFull.allow_nan_stats} -Boolean describing behavior when a stat is undefined for batch member. +`Boolean` describing behavior when stats are undefined. 
- - - @@ -342,7 +340,7 @@ Standard deviation of the distribution. #### `tf.contrib.distributions.MultivariateNormalFull.validate_args` {#MultivariateNormalFull.validate_args} -Boolean describing behavior on invalid input. +`Boolean` describing behavior on invalid input. - - - diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.Normal.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.Normal.md index 3826c2812f2..159e477f03f 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.Normal.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.Normal.md @@ -52,15 +52,15 @@ broadcasting (e.g. `mu + sigma` is a valid operation). ##### Args: -* `mu`: `float` or `double` tensor, the means of the distribution(s). -* `sigma`: `float` or `double` tensor, the stddevs of the distribution(s). +* `mu`: Floating point tensor, the means of the distribution(s). +* `sigma`: Floating point tensor, the stddevs of the distribution(s). sigma must contain only positive values. * `validate_args`: Whether to assert that `sigma > 0`. If `validate_args` is - False, correct output is not guaranteed when input is invalid. -* `allow_nan_stats`: Boolean, default False. If False, raise an exception if - a statistic (e.g. mean/mode/etc...) is undefined for any batch member. - If True, batch members with valid parameters leading to undefined - statistics will return NaN for this statistic. + `False`, correct output is not guaranteed when input is invalid. +* `allow_nan_stats`: Boolean, default `False`. If `False`, raise an + exception if a statistic (e.g. mean/mode/etc...) is undefined for any + batch member. If `True`, batch members with valid parameters leading to + undefined statistics will return NaN for this statistic. * `name`: The name to give Ops created by the initializer. 
##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.kl.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.kl.md index 62f0a904016..014d2792b6b 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.kl.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.distributions.kl.md @@ -7,9 +7,9 @@ Get the KL-divergence KL(dist_a || dist_b). * `dist_a`: instance of distributions.Distribution. * `dist_b`: instance of distributions.Distribution. -* `allow_nan`: If False (default), a runtime error is raised +* `allow_nan`: If `False` (default), a runtime error is raised if the KL returns NaN values for any batch entry of the given - distributions. If True, the KL may return a NaN for the given entry. + distributions. If `True`, the KL may return a NaN for the given entry. * `name`: (optional) Name scope to use for created operations. ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.graph_editor.ph.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.graph_editor.ph.md new file mode 100644 index 00000000000..c765240585a --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.graph_editor.ph.md @@ -0,0 +1,20 @@ +### `tf.contrib.graph_editor.ph(dtype, shape=None, scope=None)` {#ph} + +Create a tf.placeholder for the Graph Editor. + +Note that the correct graph scope must be set by the calling function. +The placeholder is named using the function placeholder_name (with no +tensor argument). + +##### Args: + + +* `dtype`: the tensor type. +* `shape`: the tensor shape (optional). +* `scope`: absolute scope within which to create the placeholder. None + means that the scope of t is preserved. "" means the root scope. + +##### Returns: + + A newly created tf.placeholder. 
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.graph_editor.sgv.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.graph_editor.sgv.md new file mode 100644 index 00000000000..36b4de6315d --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.graph_editor.sgv.md @@ -0,0 +1,25 @@ +### `tf.contrib.graph_editor.sgv(*args, **kwargs)` {#sgv} + +Create a SubGraphView from selected operations and passthrough tensors. + +##### Args: + + +* `*args`: list of 1) regular expressions (compiled or not) or 2) (array of) + tf.Operation 3) (array of) tf.Tensor. Those objects will be converted + into a list of operations and a list of candidate for passthrough tensors. +* `**kwargs`: keyword graph is used 1) to check that the ops and ts are from + the correct graph 2) for regular expression query + +##### Returns: + + A subgraph view. + +##### Raises: + + +* `TypeError`: if the optional keyword argument graph is not a tf.Graph + or if an argument in args is not an (array of) tf.Tensor + or an (array of) tf.Operation or a string or a regular expression. +* `ValueError`: if one of the keyword arguments is unexpected. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.learn.TensorFlowClassifier.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.learn.TensorFlowClassifier.md index e9c910ac619..2318f59670f 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.learn.TensorFlowClassifier.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.contrib.learn.TensorFlowClassifier.md @@ -31,56 +31,7 @@ Returns weights of deep neural network part. 
#### `tf.contrib.learn.TensorFlowClassifier.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowClassifier.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. 
- `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.less.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.less.md index 8791d0366aa..3a00afa8db5 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.less.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.less.md @@ -2,6 +2,9 @@ Returns the truth value of (x < y) element-wise. +*NOTE*: `Less` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.logical_or.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.logical_or.md index be18e65e92e..e04b6a15d2c 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.logical_or.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.logical_or.md @@ -2,6 +2,9 @@ Returns the truth value of x OR y element-wise. +*NOTE*: `LogicalOr` supports broadcasting. 
More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.variable_scope.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.variable_scope.md index 72c790f627e..9c89cd4fb34 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.variable_scope.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard7/tf.variable_scope.md @@ -1,4 +1,4 @@ -### `tf.variable_scope(name_or_scope, reuse=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None)` {#variable_scope} +### `tf.variable_scope(name_or_scope, reuse=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, dtype=None)` {#variable_scope} Returns a context for variable scope. @@ -69,6 +69,8 @@ then all its sub-scopes become reusing as well. * `caching_device`: default caching device for variables within this scope. * `partitioner`: default partitioner for variables within this scope. * `custom_getter`: default custom getter for variables within this scope. +* `dtype`: type of variables created in this scope (defaults to the type + in the passed scope, or inherited from parent scope). ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.Session.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.Session.md index fb9ff9834f2..ce09a8f8cca 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.Session.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.Session.md @@ -80,32 +80,55 @@ the session constructor. #### `tf.Session.run(fetches, feed_dict=None, options=None, run_metadata=None)` {#Session.run} -Runs the operations and evaluates the tensors in `fetches`. +Runs operations and evaluates tensors in `fetches`. 
This method runs one "step" of TensorFlow computation, by running the necessary graph fragment to execute every `Operation` and evaluate every `Tensor` in `fetches`, substituting the values in `feed_dict` for the corresponding input values. -The `fetches` argument may be a single graph element, an arbitrarily nested -list of graph elements, or a dictionary whose values are the above. The type -of `fetches` determines the return value of this method. A graph element can -be one of the following types: +The `fetches` argument may be a single graph element, or an arbitrarily +nested list, tuple, namedtuple, or dict containing graph elements at its +leaves. A graph element can be one of the following types: -* If an element of `fetches` is an - [`Operation`](../../api_docs/python/framework.md#Operation), the - corresponding fetched value will be `None`. -* If an element of `fetches` is a - [`Tensor`](../../api_docs/python/framework.md#Tensor), the corresponding - fetched value will be a numpy ndarray containing the value of that tensor. -* If an element of `fetches` is a - [`SparseTensor`](../../api_docs/python/sparse_ops.md#SparseTensor), - the corresponding fetched value will be a +* An [`Operation`](../../api_docs/python/framework.md#Operation). + The corresponding fetched value will be `None`. +* A [`Tensor`](../../api_docs/python/framework.md#Tensor). + The corresponding fetched value will be a numpy ndarray containing the + value of that tensor. +* A [`SparseTensor`](../../api_docs/python/sparse_ops.md#SparseTensor). + The corresponding fetched value will be a [`SparseTensorValue`](../../api_docs/python/sparse_ops.md#SparseTensorValue) containing the value of that sparse tensor. -* If an element of `fetches` is produced by a `get_tensor_handle` op, - the corresponding fetched value will be a numpy ndarray containing the - handle of that tensor. +* A `get_tensor_handle` op. 
The corresponding fetched value will be a + numpy ndarray containing the handle of that tensor. +* A `string` which is the name of a tensor or operation in the graph. + +The value returned by `run()` has the same shape as the `fetches` argument, +where the leaves are replaced by the corresponding values returned by +TensorFlow. + +Example: + +```python + a = tf.constant([10, 20]) + b = tf.constant([1.0, 2.0]) + # 'fetches' can be a singleton + v = session.run(a) + # v is the numpy array [10, 20] + # 'fetches' can be a list. + v = session.run([a, b]) + # v a Python list with 2 numpy arrays: the numpy array [10, 20] and the + # 1-D array [1.0, 2.0] + # 'fetches' can be arbitrary lists, tuples, namedtuple, dicts: + MyData = collections.namedtuple('MyData', ['a', 'b']) + v = session.run({'k1': MyData(a, b), 'k2': [b, a]}) + # v is a dict with + # v['k1'] is a MyData namedtuple with 'a' the numpy array [10, 20] and + # 'b' the numpy array [1.0, 2.0] + # v['k2'] is a list with the numpy array [1.0, 2.0] and the numpy array + # [10, 20]. +``` The optional `feed_dict` argument allows the caller to override the value of tensors in the graph. Each key in `feed_dict` can be diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.assert_negative.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.assert_negative.md index 81daebec0da..b06c50fad47 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.assert_negative.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.assert_negative.md @@ -1,4 +1,4 @@ -### `tf.assert_negative(x, data=None, summarize=None, name=None)` {#assert_negative} +### `tf.assert_negative(x, data=None, summarize=None, message=None, name=None)` {#assert_negative} Assert the condition `x < 0` holds element-wise. @@ -25,6 +25,7 @@ If `x` is empty this is trivially satisfied. * `data`: The tensors to print out if the condition is False. 
Defaults to error message and first few entries of `x`. * `summarize`: Print this many entries of each tensor. +* `message`: A string to prefix to the default message. * `name`: A name for this operation (optional). Defaults to "assert_negative". ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.Transformer.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.Transformer.md new file mode 100644 index 00000000000..d070c982f1d --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.Transformer.md @@ -0,0 +1,51 @@ +Transform a subgraph into another one. + +By default, the constructor create a transform which copy a subgraph and +replaces inputs with placeholders. This behavior can be modified by changing +the handlers. +- - - + +#### `tf.contrib.graph_editor.Transformer.__init__()` {#Transformer.__init__} + +Transformer constructor. + +The following members can be modified: +transform_op_handler: handle the transformation of a tf.Operation. + This handler defaults to a simple copy. +assign_collections_handler: handle the assignment of collections. + This handler defaults to assigning new collections created under the + given name-scope. +transform_input_handler: handle the transform of the inputs to the given + subgraph. This handler defaults to creating placeholders instead of the + ops just before the input tensors of the subgraph. +transform_hidden_input_handler: handle the transform of the hidden inputs of + the subgraph, that is, the inputs which are not listed in sgv.inputs. + This handler defaults to a transform which keep the same input if the + source and destination graphs are the same, otherwise use placeholders. +transform_original_op_hanlder: handle the transform of original_op. This + handler defaults to transforming original_op only if they are in the + subgraph, otherwise they are ignored. 
+ + +- - - + +#### `tf.contrib.graph_editor.Transformer.new_name(name)` {#Transformer.new_name} + +Compute a destination name from a source name. + +##### Args: + + +* `name`: the name to be "transformed". + +##### Returns: + + the transformed name. + +##### Raises: + + +* `ValueError`: if the source scope is used (that is, not an empty string) + and the source name does not belong to the source scope. + + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.detach.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.detach.md new file mode 100644 index 00000000000..e04134d548e --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.detach.md @@ -0,0 +1,28 @@ +### `tf.contrib.graph_editor.detach(sgv, control_inputs=False, control_outputs=None, control_ios=None)` {#detach} + +Detach both the inputs and the outputs of a subgraph view. + +##### Args: + + +* `sgv`: the subgraph view to be detached. This argument is converted to a + subgraph using the same rules as the function subgraph.make_view. +* `control_inputs`: A boolean indicating whether control inputs are enabled. +* `control_outputs`: An instance of util.ControlOutputs or None. If not None, + control outputs are enabled. +* `control_ios`: An instance of util.ControlOutputs or None. If not None, both + control inputs and control outputs are enabled. This is equivalent to set + control_inputs to True and control_outputs to the util.ControlOutputs + instance. + +##### Returns: + + A new subgraph view of the detached subgraph. + Note that sgv is also modified in place. + +##### Raises: + + +* `StandardError`: if sgv cannot be converted to a SubGraphView using + the same rules than the function subgraph.make_view. 
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.reroute_b2a_outputs.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.reroute_b2a_outputs.md new file mode 100644 index 00000000000..b14ea3485b0 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.reroute_b2a_outputs.md @@ -0,0 +1,4 @@ +### `tf.contrib.graph_editor.reroute_b2a_outputs(sgv0, sgv1)` {#reroute_b2a_outputs} + +Re-route all the outputs of sgv1 to sgv0 (see _reroute_outputs). + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.swap.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.swap.md new file mode 100644 index 00000000000..d6fab641cc7 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.graph_editor.swap.md @@ -0,0 +1,4 @@ +### `tf.contrib.graph_editor.swap(sgv0, sgv1)` {#swap} + +Swap the inputs and outputs of sgv1 to sgv0 (see _reroute). + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.layers.max_pool2d.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.layers.max_pool2d.md index 5dd8fbf68d2..a4308c0f468 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.layers.max_pool2d.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.layers.max_pool2d.md @@ -1,30 +1,29 @@ ### `tf.contrib.layers.max_pool2d(*args, **kwargs)` {#max_pool2d} -Adds a Max Pooling op. +Adds a 2D Max Pooling op. -It is assumed by the wrapper that the pooling is only done per image and not -in depth or batch. +It is assumed that the pooling is done per image but not in batch or channels. ##### Args: -* `inputs`: a tensor of size [batch_size, height, width, depth]. 
-* `kernel_size`: a list of length 2: [kernel_height, kernel_width] of the +* `inputs`: A `Tensor` of size [batch_size, height, width, channels]. +* `kernel_size`: A list of length 2: [kernel_height, kernel_width] of the pooling kernel over which the op is computed. Can be an int if both values are the same. -* `stride`: a list of length 2: [stride_height, stride_width]. - Can be an int if both strides are the same. Note that presently +* `stride`: A list of length 2: [stride_height, stride_width]. + Can be an int if both strides are the same. Note that presently both strides must have the same value. -* `padding`: the padding method, either 'VALID' or 'SAME'. -* `outputs_collections`: collection to add the outputs. +* `padding`: The padding method, either 'VALID' or 'SAME'. +* `outputs_collections`: The collections to which the outputs are added. * `scope`: Optional scope for op_scope. ##### Returns: - a tensor representing the results of the pooling operation. + A `Tensor` representing the results of the pooling operation. ##### Raises: -* `ValueError`: if 'kernel_size' is not a 2-D list +* `ValueError`: If 'kernel_size' is not a 2-D list diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.learn.TensorFlowRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.learn.TensorFlowRegressor.md index 8dcf209b03a..9424e537c40 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.learn.TensorFlowRegressor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.learn.TensorFlowRegressor.md @@ -31,56 +31,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.TensorFlowRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowRegressor.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. 
If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. 
See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.rnn.LSTMFusedCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.rnn.LSTMFusedCell.md new file mode 100644 index 00000000000..b1e9fde7160 --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.contrib.rnn.LSTMFusedCell.md @@ -0,0 +1,60 @@ +Basic LSTM recurrent network cell. + +The implementation is based on: http://arxiv.org/abs/1409.2329. + +We add forget_bias (default: 1) to the biases of the forget gate in order to +reduce the scale of forgetting in the beginning of the training. + +Unlike BasicLSTMCell, this is a monolithic op and should be much faster. The +weight and bias matrixes should be compatible as long as the variabel scope +matches. +- - - + +#### `tf.contrib.rnn.LSTMFusedCell.__init__(num_units, forget_bias=1.0, use_peephole=False)` {#LSTMFusedCell.__init__} + +Initialize the basic LSTM cell. + +##### Args: + + +* `num_units`: int, The number of units in the LSTM cell. +* `forget_bias`: float, The bias added to forget gates (see above). +* `use_peephole`: Whether to use peephole connections or not. + + +- - - + +#### `tf.contrib.rnn.LSTMFusedCell.output_size` {#LSTMFusedCell.output_size} + + + + +- - - + +#### `tf.contrib.rnn.LSTMFusedCell.state_size` {#LSTMFusedCell.state_size} + + + + +- - - + +#### `tf.contrib.rnn.LSTMFusedCell.zero_state(batch_size, dtype)` {#LSTMFusedCell.zero_state} + +Return zero-filled state tensor(s). + +##### Args: + + +* `batch_size`: int, float, or unit Tensor representing the batch size. +* `dtype`: the data type to use for the state. 
+ +##### Returns: + + If `state_size` is an int or TensorShape, then the return value is a + `N-D` tensor of shape `[batch_size x state_size]` filled with zeros. + + If `state_size` is a nested list or tuple, then the return value is + a nested list or tuple (of the same structure) of `2-D` tensors with +the shapes `[batch_size x s]` for each s in `state_size`. + + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.equal.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.equal.md index 998db9189ff..332a12f7255 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.equal.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.equal.md @@ -2,6 +2,9 @@ Returns the truth value of (x == y) element-wise. +*NOTE*: `Equal` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.less_equal.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.less_equal.md index 65d7eb50842..c8ce84b6691 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.less_equal.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.less_equal.md @@ -2,6 +2,9 @@ Returns the truth value of (x <= y) element-wise. +*NOTE*: `LessEqual` supports broadcasting. 
More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.matrix_inverse.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.matrix_inverse.md index 4172badef50..1edc4a9ec9e 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.matrix_inverse.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.matrix_inverse.md @@ -1,6 +1,6 @@ ### `tf.matrix_inverse(input, adjoint=None, name=None)` {#matrix_inverse} -Calculates the inverse of a square invertible matrix or its adjoint (conjugate +Computes the inverse of a square invertible matrix or its adjoint (conjugate transpose). diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.dynamic_rnn.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.dynamic_rnn.md index 34a275c6a1d..623e04e33f0 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.dynamic_rnn.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.dynamic_rnn.md @@ -14,8 +14,9 @@ The corresponding output is either a single `Tensor` having the same number of time steps and batch size, or a (possibly nested) tuple of such tensors, matching the nested structure of `cell.output_size`. -The parameter `sequence_length` is required and dynamic calculation is -automatically performed. +The parameter `sequence_length` is optional and is used to copy-through state +and zero-out outputs when past a batch element's sequence length. So it's more +for correctness than performance, unlike in rnn(). 
##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.strided_slice.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.strided_slice.md new file mode 100644 index 00000000000..9c4be0347ea --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.strided_slice.md @@ -0,0 +1,85 @@ +### `tf.strided_slice(input_, begin, end, strides, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0, name=None)` {#strided_slice} + +Extracts a strided slice from a tensor. + +To a first order, this operation extracts a slice of size `end - begin` +from a tensor `input` +starting at the location specified by `begin`. The slice continues by adding +`stride` to the `begin` index until all dimensions are not less than `end`. +Note that components of stride can be negative, which causes a reverse +slice. + +This operation can be thought of an encoding of a numpy style sliced +range. Given a python slice input[, , ..., ] +this function will be called as follows. + +`begin`, `end`, and `strides` will be all length n. n is in general +not the same dimensionality as `input`. + +For the ith spec, +`begin_mask`, `end_mask`, `ellipsis_mask`, `new_axis_mask`, +and `shrink_axis_mask` will have the ith bit corrsponding to +the ith spec. + +If the ith bit of `begin_mask` is non-zero, `begin[i]` is ignored and +the fullest possible range in that dimension is used instead. +`end_mask` works analogously, except with the end range. + +`foo[5:,:,:3]` on a 7x8x9 tensor is equivalent to `foo[5:7,0:8,0:3]`. +`foo[::-1]` reverses a tensor with shape 8. + + +If the ith bit of `ellipsis_mask`, as many unspecified dimensions +as needed will be inserted between other dimensions. Only one +non-zero bit is allowed in `ellipsis_mask`. + +For example `foo[3:5,...,4:5]` on a shape 10x3x3x10 tensor is +equivalent to `foo[3:5,:,:,4:5]` and +`foo[3:5,...]` is equivalent to `foo[3:5,:,:,:]`. 
+ +If the ith bit of `new_axis_mask` is one, then a `begin`, +`end`, and `stride` are ignored and a new length 1 dimension is +added at this point in the output tensor. + +For example `foo[3:5,4]` on a 10x8 tensor produces a shape 2 tensor +whereas `foo[3:5,4:5]` produces a shape 2x1 tensor with shrink_mask +being 1<<1 == 2. + +If the ith bit of `shrink_axis_mask` is one, then `begin`, +`end[i]`, and `stride[i]` are used to do a slice in the appropriate +dimension, but the output tensor will be reduced in dimensionality +by one. This is only valid if the ith entry of slice[i]==1. + +NOTE: `begin` and `end` are zero-indexed`. +`strides` entries must be non-zero. + + +``` +# 'input' is [[[1, 1, 1], [2, 2, 2]], +# [[3, 3, 3], [4, 4, 4]], +# [[5, 5, 5], [6, 6, 6]]] +tf.slice(input, [1, 0, 0], [2, 1, 3], [1, 1, 1]) ==> [[[3, 3, 3]]] +tf.slice(input, [1, 0, 0], [2, 2, 3], [1, 1, 1]) ==> [[[3, 3, 3], + [4, 4, 4]]] +tf.slice(input, [1, 1, 0], [2, -1, 3], [1, -1, 1]) ==>[[[4, 4, 4], + [3, 3, 3]]] +``` + +##### Args: + + +* `input_`: A `Tensor`. +* `begin`: An `int32` or `int64` `Tensor`. +* `end`: An `int32` or `int64` `Tensor`. +* `strides`: An `int32` or `int64` `Tensor`. +* `begin_mask`: An `int32` mask. +* `end_mask`: An `int32` mask. +* `ellipsis_mask`: An `int32` mask. +* `new_axis_mask`: An `int32` mask. +* `shrink_axis_mask`: An `int32` mask. +* `name`: A name for the operation (optional). + +##### Returns: + + A `Tensor` the same type as `input`. + diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.ffmpeg.decode_audio.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.ffmpeg.decode_audio.md index 31b9cba01f6..b4c74fc9211 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.ffmpeg.decode_audio.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.ffmpeg.decode_audio.md @@ -21,5 +21,6 @@ Create an op that decodes the contents of an audio file. 
A rank 2 tensor that has time along dimension 0 and channels along dimension 1. Dimension 0 will be `samples_per_second * length` wide, and - dimension 1 will be `channel_count` wide. + dimension 1 will be `channel_count` wide. If ffmpeg fails to decode the + audio then an empty tensor will be returned. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.graph_editor.connect.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.graph_editor.connect.md new file mode 100644 index 00000000000..134765ea06b --- /dev/null +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.graph_editor.connect.md @@ -0,0 +1,26 @@ +### `tf.contrib.graph_editor.connect(sgv0, sgv1, disconnect_first=False)` {#connect} + +Connect the outputs of sgv0 to the inputs of sgv1. + +##### Args: + + +* `sgv0`: the first subgraph to have its outputs swapped. This argument is + converted to a subgraph using the same rules as the function + subgraph.make_view. +* `sgv1`: the second subgraph to have its outputs swapped. This argument is + converted to a subgraph using the same rules as the function + subgraph.make_view. +* `disconnect_first`: if True the current outputs of sgv0 are disconnected. + +##### Returns: + + Two new subgraph views (now connected). sgv0 and svg1 are also modified + in place. + +##### Raises: + + +* `StandardError`: if sgv0 or sgv1 cannot be converted to a SubGraphView using + the same rules than the function subgraph.make_view. 
+ diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md index 4b75dcc7b0d..95d0d145099 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.DNNRegressor.md @@ -118,56 +118,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.DNNRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#DNNRegressor.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. - -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. 
-* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. ##### Raises: @@ -181,37 +132,7 @@ for which this evaluation was performed. #### `tf.contrib.learn.DNNRegressor.fit(x=None, y=None, input_fn=None, steps=None, batch_size=None, monitors=None, max_steps=None)` {#DNNRegressor.fit} -Trains a model given training data `x` predictions and `y` targets. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. 
-* `steps`: Number of steps for which to train model. If `None`, train forever. - If set, `max_steps` must be `None`. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`. Must be `None` if `input_fn` is provided. -* `monitors`: List of `BaseMonitor` subclass instances. Used for callbacks - inside the training loop. -* `max_steps`: Number of total steps for which to train model. If `None`, - train forever. If set, `steps` must be `None`. - - Two calls to `fit(steps=100)` means 200 training - iterations. On the other hand, two calls to `fit(max_steps=100)` means - that the second call will not do any iteration since first call did - all 100 steps. - -##### Returns: - - `self`, for chaining. +See `Trainable`. ##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.TensorFlowLinearRegressor.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.TensorFlowLinearRegressor.md index 161b7d5fd3f..a8362b03686 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.TensorFlowLinearRegressor.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.contrib.learn.TensorFlowLinearRegressor.md @@ -31,56 +31,7 @@ Returns weights of deep neural network part. #### `tf.contrib.learn.TensorFlowLinearRegressor.evaluate(x=None, y=None, input_fn=None, feed_fn=None, batch_size=None, steps=None, metrics=None, name=None)` {#TensorFlowLinearRegressor.evaluate} -Evaluates given model with provided evaluation data. - -Evaluates on the given input data. If `input_fn` is provided, that -input function should raise an end-of-input exception (`OutOfRangeError` or -`StopIteration`) after one epoch of the training data has been provided. - -By default, the whole evaluation dataset is used. If `steps` is provided, -only `steps` batches of size `batch_size` are processed. 
- -The return value is a dict containing the metrics specified in `metrics`, as -well as an entry `global_step` which contains the value of the global step -for which this evaluation was performed. - -##### Args: - - -* `x`: Matrix of shape [n_samples, n_features...]. Can be iterator that - returns arrays of features. The training input samples for fitting the - model. If set, `input_fn` must be `None`. -* `y`: Vector or matrix [n_samples] or [n_samples, n_outputs]. Can be - iterator that returns array of targets. The training target values - (class labels in classification, real numbers in regression). If set, - `input_fn` must be `None`. -* `input_fn`: Input function. If set, `x`, `y`, and `batch_size` must be - `None`. -* `feed_fn`: Function creating a feed dict every time it is called. Called - once per iteration. -* `batch_size`: minibatch size to use on the input, defaults to first - dimension of `x`, if specified. Must be `None` if `input_fn` is - provided. -* `steps`: Number of steps for which to evaluate model. If `None`, evaluate - until running tensors generated by `metrics` raises an exception. -* `metrics`: Dict of metric ops to run. If `None`, the default metric - functions are used; if `{}`, no metrics are used. If model has one - output (i.e., returning single predction), keys are `str`, e.g. - `'accuracy'` - just a name of the metric that will show up in - the logs / summaries. Otherwise, keys are tuple of two `str`, e.g. - `('accuracy', 'classes')`- name of the metric and name of `Tensor` in - the predictions to run this metric on. - - Metric ops should support streaming, e.g., returning - update_op and value tensors. See more details in - ../../../../metrics/python/metrics/ops/streaming_metrics.py. - -* `name`: Name of the evaluation if user needs to run multiple evaluations on - different data sets, such as on training data vs test data. - -##### Returns: - - Returns `dict` with evaluation results. +See `Evaluable`. 
##### Raises: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_variable.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_variable.md index bbbc297a94a..feb96eb180e 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_variable.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.get_variable.md @@ -1,4 +1,4 @@ -### `tf.get_variable(name, shape=None, dtype=tf.float32, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#get_variable} +### `tf.get_variable(name, shape=None, dtype=None, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#get_variable} Gets an existing variable with these parameters or create a new one. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.matrix_determinant.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.matrix_determinant.md index a5cd5a7fe68..fcaa1b1c774 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.matrix_determinant.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.matrix_determinant.md @@ -1,6 +1,6 @@ ### `tf.matrix_determinant(input, name=None)` {#matrix_determinant} -Calculates the determinant of a square matrix. +Computes the determinant of a square matrix. 
##### Args: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.rnn_cell.RNNCell.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.rnn_cell.RNNCell.md index 5ecf4e515fe..ab13073fd5d 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.rnn_cell.RNNCell.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.nn.rnn_cell.RNNCell.md @@ -1,5 +1,10 @@ Abstract object representing an RNN cell. +The definition of cell in this package differs from the definition used in the +literature. In the literature, cell refers to an object with a single scalar +output. The definition in this package refers to a horizontal array of such +units. + An RNN cell, in the most abstract setting, is anything that has a state and performs some operation that takes a matrix of inputs. This operation results in an output matrix with `self.output_size` columns. diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md index 561e5e196f0..2e272685948 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.scan.md @@ -1,4 +1,4 @@ -### `tf.scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, swap_memory=False, name=None)` {#scan} +### `tf.scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, swap_memory=False, infer_shape=True, name=None)` {#scan} scan on the list of tensors unpacked from `elems` on dimension 0. @@ -50,6 +50,7 @@ For example, if `elems` is `(t1, [t2, t3])` and `initializer` is in parallel. * `back_prop`: (optional) True enables support for back propagation. * `swap_memory`: (optional) True enables GPU-CPU memory swapping. +* `infer_shape`: (optional) False disables tests for consistent output shapes. 
* `name`: (optional) Name prefix for the returned tensors. ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_merge.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_merge.md index 38742123d64..b2f9570b2c6 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_merge.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.sparse_merge.md @@ -1,4 +1,4 @@ -### `tf.sparse_merge(sp_ids, sp_values, vocab_size, name=None)` {#sparse_merge} +### `tf.sparse_merge(sp_ids, sp_values, vocab_size, name=None, already_sorted=False)` {#sparse_merge} Combines a batch of feature ids and values into a single `SparseTensor`. @@ -18,14 +18,17 @@ The `SparseTensor` returned by this function has the following properties: For example, consider the following feature vectors: +```python vector1 = [-3, 0, 0, 0, 0, 0] vector2 = [ 0, 1, 0, 4, 1, 0] vector3 = [ 5, 0, 0, 9, 0, 0] +``` These might be stored sparsely in the following Example protos by storing only the feature ids (column number if the vectors are treated as a matrix) of the non-zero elements and the corresponding values: +```python examples = [Example(features={ "ids": Feature(int64_list=Int64List(value=[0])), "values": Feature(float_list=FloatList(value=[-3]))}), @@ -35,6 +38,7 @@ of the non-zero elements and the corresponding values: Example(features={ "ids": Feature(int64_list=Int64List(value=[0, 3])), "values": Feature(float_list=FloatList(value=[5, 9]))})] +``` The result of calling parse_example on these examples will produce a dictionary with entries for "ids" and "values". Passing those two objects @@ -47,9 +51,11 @@ batch, and the second dimension is the column number, i.e., the feature id); original matrix, i.e., (3, 6). 
For our example above, the output will be equal to: +```python SparseTensor(indices=[[0, 0], [1, 1], [1, 3], [1, 4], [2, 0], [2, 3]], values=[-3, 1, 4, 1, 5, 9], shape=[3, 6]) +``` ##### Args: @@ -60,6 +66,9 @@ equal to: * `vocab_size`: A scalar `int64` Tensor (or Python int) containing the new size of the last dimension, `all(0 <= sp_ids.values < vocab_size)`. * `name`: A name prefix for the returned tensors (optional) +* `already_sorted`: A boolean to specify whether the per-batch values in + `sp_values` are already sorted. If so skip sorting, False by default + (optional). ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.squared_difference.md b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.squared_difference.md index d6bb175669c..19f25f473da 100644 --- a/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.squared_difference.md +++ b/tensorflow/g3doc/api_docs/python/functions_and_classes/shard9/tf.squared_difference.md @@ -2,6 +2,9 @@ Returns (x - y)(x - y) element-wise. +*NOTE*: `SquaredDifference` supports broadcasting. 
More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: diff --git a/tensorflow/g3doc/api_docs/python/index.md b/tensorflow/g3doc/api_docs/python/index.md index 49adce5f64f..c3ea11b4d3f 100644 --- a/tensorflow/g3doc/api_docs/python/index.md +++ b/tensorflow/g3doc/api_docs/python/index.md @@ -58,6 +58,7 @@ * [`multinomial`](../../api_docs/python/constant_op.md#multinomial) * [`ones`](../../api_docs/python/constant_op.md#ones) * [`ones_like`](../../api_docs/python/constant_op.md#ones_like) + * [`ops`](../../api_docs/python/constant_op.md#ops) * [`random_crop`](../../api_docs/python/constant_op.md#random_crop) * [`random_gamma`](../../api_docs/python/constant_op.md#random_gamma) * [`random_normal`](../../api_docs/python/constant_op.md#random_normal) @@ -120,6 +121,7 @@ * [`boolean_mask`](../../api_docs/python/array_ops.md#boolean_mask) * [`cast`](../../api_docs/python/array_ops.md#cast) * [`concat`](../../api_docs/python/array_ops.md#concat) + * [`copy`](../../api_docs/python/array_ops.md#copy) * [`depth_to_space`](../../api_docs/python/array_ops.md#depth_to_space) * [`dynamic_partition`](../../api_docs/python/array_ops.md#dynamic_partition) * [`dynamic_stitch`](../../api_docs/python/array_ops.md#dynamic_stitch) @@ -144,6 +146,7 @@ * [`space_to_depth`](../../api_docs/python/array_ops.md#space_to_depth) * [`split`](../../api_docs/python/array_ops.md#split) * [`squeeze`](../../api_docs/python/array_ops.md#squeeze) + * [`strided_slice`](../../api_docs/python/array_ops.md#strided_slice) * [`string_to_number`](../../api_docs/python/array_ops.md#string_to_number) * [`tile`](../../api_docs/python/array_ops.md#tile) * [`to_bfloat16`](../../api_docs/python/array_ops.md#to_bfloat16) @@ -185,6 +188,8 @@ * [`batch_matrix_transpose`](../../api_docs/python/math_ops.md#batch_matrix_transpose) * [`batch_matrix_triangular_solve`](../../api_docs/python/math_ops.md#batch_matrix_triangular_solve) * 
[`batch_self_adjoint_eig`](../../api_docs/python/math_ops.md#batch_self_adjoint_eig) + * [`batch_self_adjoint_eigvals`](../../api_docs/python/math_ops.md#batch_self_adjoint_eigvals) + * [`batch_svd`](../../api_docs/python/math_ops.md#batch_svd) * [`ceil`](../../api_docs/python/math_ops.md#ceil) * [`cholesky`](../../api_docs/python/math_ops.md#cholesky) * [`cholesky_solve`](../../api_docs/python/math_ops.md#cholesky_solve) @@ -250,6 +255,7 @@ * [`segment_prod`](../../api_docs/python/math_ops.md#segment_prod) * [`segment_sum`](../../api_docs/python/math_ops.md#segment_sum) * [`self_adjoint_eig`](../../api_docs/python/math_ops.md#self_adjoint_eig) + * [`self_adjoint_eigvals`](../../api_docs/python/math_ops.md#self_adjoint_eigvals) * [`sign`](../../api_docs/python/math_ops.md#sign) * [`sin`](../../api_docs/python/math_ops.md#sin) * [`sparse_segment_mean`](../../api_docs/python/math_ops.md#sparse_segment_mean) @@ -260,6 +266,7 @@ * [`square`](../../api_docs/python/math_ops.md#square) * [`squared_difference`](../../api_docs/python/math_ops.md#squared_difference) * [`sub`](../../api_docs/python/math_ops.md#sub) + * [`svd`](../../api_docs/python/math_ops.md#svd) * [`tan`](../../api_docs/python/math_ops.md#tan) * [`trace`](../../api_docs/python/math_ops.md#trace) * [`transpose`](../../api_docs/python/math_ops.md#transpose) @@ -459,6 +466,7 @@ * [`l2_normalize`](../../api_docs/python/nn.md#l2_normalize) * [`learned_unigram_candidate_sampler`](../../api_docs/python/nn.md#learned_unigram_candidate_sampler) * [`local_response_normalization`](../../api_docs/python/nn.md#local_response_normalization) + * [`log_poisson_loss`](../../api_docs/python/nn.md#log_poisson_loss) * [`log_softmax`](../../api_docs/python/nn.md#log_softmax) * [`log_uniform_candidate_sampler`](../../api_docs/python/nn.md#log_uniform_candidate_sampler) * [`max_pool`](../../api_docs/python/nn.md#max_pool) @@ -596,6 +604,7 @@ * 
[`batch_matrix_diag_transform`](../../api_docs/python/contrib.distributions.md#batch_matrix_diag_transform) * [`Bernoulli`](../../api_docs/python/contrib.distributions.md#Bernoulli) * [`Beta`](../../api_docs/python/contrib.distributions.md#Beta) + * [`Binomial`](../../api_docs/python/contrib.distributions.md#Binomial) * [`Categorical`](../../api_docs/python/contrib.distributions.md#Categorical) * [`Chi2`](../../api_docs/python/contrib.distributions.md#Chi2) * [`Dirichlet`](../../api_docs/python/contrib.distributions.md#Dirichlet) @@ -606,8 +615,10 @@ * [`InverseGamma`](../../api_docs/python/contrib.distributions.md#InverseGamma) * [`kl`](../../api_docs/python/contrib.distributions.md#kl) * [`Laplace`](../../api_docs/python/contrib.distributions.md#Laplace) + * [`Multinomial`](../../api_docs/python/contrib.distributions.md#Multinomial) * [`MultivariateNormalCholesky`](../../api_docs/python/contrib.distributions.md#MultivariateNormalCholesky) * [`MultivariateNormalDiag`](../../api_docs/python/contrib.distributions.md#MultivariateNormalDiag) + * [`MultivariateNormalDiagPlusVDVT`](../../api_docs/python/contrib.distributions.md#MultivariateNormalDiagPlusVDVT) * [`MultivariateNormalFull`](../../api_docs/python/contrib.distributions.md#MultivariateNormalFull) * [`Normal`](../../api_docs/python/contrib.distributions.md#Normal) * [`normal_congugates_known_sigma_predictive`](../../api_docs/python/contrib.distributions.md#normal_congugates_known_sigma_predictive) @@ -633,6 +644,7 @@ * [`convert_to_tensor_or_sparse_tensor`](../../api_docs/python/contrib.framework.md#convert_to_tensor_or_sparse_tensor) * [`create_global_step`](../../api_docs/python/contrib.framework.md#create_global_step) * [`deprecated`](../../api_docs/python/contrib.framework.md#deprecated) + * [`deprecated_arg_values`](../../api_docs/python/contrib.framework.md#deprecated_arg_values) * [`get_global_step`](../../api_docs/python/contrib.framework.md#get_global_step) * 
[`get_graph_from_inputs`](../../api_docs/python/contrib.framework.md#get_graph_from_inputs) * [`get_local_variables`](../../api_docs/python/contrib.framework.md#get_local_variables) @@ -657,6 +669,31 @@ * [`with_same_shape`](../../api_docs/python/contrib.framework.md#with_same_shape) * [`with_shape`](../../api_docs/python/contrib.framework.md#with_shape) +* **[Graph Editor (contrib)](../../api_docs/python/contrib.graph_editor.md)**: + * [`bypass`](../../api_docs/python/contrib.graph_editor.md#bypass) + * [`connect`](../../api_docs/python/contrib.graph_editor.md#connect) + * [`detach`](../../api_docs/python/contrib.graph_editor.md#detach) + * [`detach_inputs`](../../api_docs/python/contrib.graph_editor.md#detach_inputs) + * [`detach_outputs`](../../api_docs/python/contrib.graph_editor.md#detach_outputs) + * [`matcher`](../../api_docs/python/contrib.graph_editor.md#matcher) + * [`ph`](../../api_docs/python/contrib.graph_editor.md#ph) + * [`reroute_a2b`](../../api_docs/python/contrib.graph_editor.md#reroute_a2b) + * [`reroute_a2b_inputs`](../../api_docs/python/contrib.graph_editor.md#reroute_a2b_inputs) + * [`reroute_a2b_outputs`](../../api_docs/python/contrib.graph_editor.md#reroute_a2b_outputs) + * [`reroute_b2a`](../../api_docs/python/contrib.graph_editor.md#reroute_b2a) + * [`reroute_b2a_inputs`](../../api_docs/python/contrib.graph_editor.md#reroute_b2a_inputs) + * [`reroute_b2a_outputs`](../../api_docs/python/contrib.graph_editor.md#reroute_b2a_outputs) + * [`select_ops`](../../api_docs/python/contrib.graph_editor.md#select_ops) + * [`select_ts`](../../api_docs/python/contrib.graph_editor.md#select_ts) + * [`sgv`](../../api_docs/python/contrib.graph_editor.md#sgv) + * [`sgv_scope`](../../api_docs/python/contrib.graph_editor.md#sgv_scope) + * [`SubGraphView`](../../api_docs/python/contrib.graph_editor.md#SubGraphView) + * [`swap`](../../api_docs/python/contrib.graph_editor.md#swap) + * [`swap_inputs`](../../api_docs/python/contrib.graph_editor.md#swap_inputs) + * 
[`swap_outputs`](../../api_docs/python/contrib.graph_editor.md#swap_outputs) + * [`Transformer`](../../api_docs/python/contrib.graph_editor.md#Transformer) + * [`ts`](../../api_docs/python/contrib.graph_editor.md#ts) + * **[Layers (contrib)](../../api_docs/python/contrib.layers.md)**: * [`apply_regularization`](../../api_docs/python/contrib.layers.md#apply_regularization) * [`avg_pool2d`](../../api_docs/python/contrib.layers.md#avg_pool2d) @@ -742,12 +779,20 @@ * [`get_losses`](../../api_docs/python/contrib.losses.md#get_losses) * [`get_regularization_losses`](../../api_docs/python/contrib.losses.md#get_regularization_losses) * [`get_total_loss`](../../api_docs/python/contrib.losses.md#get_total_loss) + * [`hinge_loss`](../../api_docs/python/contrib.losses.md#hinge_loss) * [`log_loss`](../../api_docs/python/contrib.losses.md#log_loss) * [`sigmoid_cross_entropy`](../../api_docs/python/contrib.losses.md#sigmoid_cross_entropy) * [`softmax_cross_entropy`](../../api_docs/python/contrib.losses.md#softmax_cross_entropy) * [`sum_of_pairwise_squares`](../../api_docs/python/contrib.losses.md#sum_of_pairwise_squares) * [`sum_of_squares`](../../api_docs/python/contrib.losses.md#sum_of_squares) +* **[RNN (contrib)](../../api_docs/python/contrib.rnn.md)**: + * [`AttentionCellWrapper`](../../api_docs/python/contrib.rnn.md#AttentionCellWrapper) + * [`CoupledInputForgetGateLSTMCell`](../../api_docs/python/contrib.rnn.md#CoupledInputForgetGateLSTMCell) + * [`GridLSTMCell`](../../api_docs/python/contrib.rnn.md#GridLSTMCell) + * [`LSTMFusedCell`](../../api_docs/python/contrib.rnn.md#LSTMFusedCell) + * [`TimeFreqLSTMCell`](../../api_docs/python/contrib.rnn.md#TimeFreqLSTMCell) + * **[Metrics (contrib)](../../api_docs/python/contrib.metrics.md)**: * [`accuracy`](../../api_docs/python/contrib.metrics.md#accuracy) * [`aggregate_metric_map`](../../api_docs/python/contrib.metrics.md#aggregate_metric_map) diff --git a/tensorflow/g3doc/api_docs/python/math_ops.md 
b/tensorflow/g3doc/api_docs/python/math_ops.md index 4a9ead85023..51cfff68af4 100644 --- a/tensorflow/g3doc/api_docs/python/math_ops.md +++ b/tensorflow/g3doc/api_docs/python/math_ops.md @@ -21,7 +21,8 @@ operators to your graph. Returns x + y element-wise. -*NOTE*: Add supports broadcasting. AddN does not. +*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) ##### Args: @@ -41,6 +42,9 @@ Returns x + y element-wise. Returns x - y element-wise. +*NOTE*: `Sub` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: @@ -59,6 +63,9 @@ Returns x - y element-wise. Returns x * y element-wise. +*NOTE*: `Mul` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: @@ -77,6 +84,9 @@ Returns x * y element-wise. Returns x / y element-wise. +*NOTE*: `Div` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: @@ -164,6 +174,9 @@ as well. Returns element-wise remainder of division. +*NOTE*: `Mod` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: @@ -504,7 +517,10 @@ Returns element-wise largest integer not greater than x. ### `tf.maximum(x, y, name=None)` {#maximum} -Returns the max of x and y (i.e. x > y ? x : y) element-wise, broadcasts. +Returns the max of x and y (i.e. x > y ? x : y) element-wise. + +*NOTE*: `Maximum` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) ##### Args: @@ -522,7 +538,10 @@ Returns the max of x and y (i.e. x > y ? x : y) element-wise, broadcasts. ### `tf.minimum(x, y, name=None)` {#minimum} -Returns the min of x and y (i.e. x < y ? x : y) element-wise, broadcasts. 
+Returns the min of x and y (i.e. x < y ? x : y) element-wise. + +*NOTE*: `Minimum` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) ##### Args: @@ -749,6 +768,9 @@ Computes the complementary error function of `x` element-wise. Returns (x - y)(x - y) element-wise. +*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + ##### Args: @@ -1365,7 +1387,7 @@ It is computed as: ### `tf.matrix_determinant(input, name=None)` {#matrix_determinant} -Calculates the determinant of a square matrix. +Computes the determinant of a square matrix. ##### Args: @@ -1384,7 +1406,7 @@ Calculates the determinant of a square matrix. ### `tf.batch_matrix_determinant(input, name=None)` {#batch_matrix_determinant} -Calculates the determinants for a batch of square matrices. +Computes the determinants for a batch of square matrices. The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form square matrices. The output is a tensor containing the determinants @@ -1407,7 +1429,7 @@ for all input submatrices `[..., :, :]`. ### `tf.matrix_inverse(input, adjoint=None, name=None)` {#matrix_inverse} -Calculates the inverse of a square invertible matrix or its adjoint (conjugate +Computes the inverse of a square invertible matrix or its adjoint (conjugate transpose). @@ -1437,7 +1459,7 @@ garbage result. ### `tf.batch_matrix_inverse(input, adjoint=None, name=None)` {#batch_matrix_inverse} -Calculates the inverse of square invertible matrices or their adjoints +Computes the inverse of square invertible matrices or their adjoints (conjugate transposes). @@ -1469,7 +1491,7 @@ garbage result. ### `tf.cholesky(input, name=None)` {#cholesky} -Calculates the Cholesky decomposition of a square matrix. +Computes the Cholesky decomposition of a square matrix. The input has to be symmetric and positive definite. 
Only the lower-triangular part of the input will be used for this operation. The upper-triangular part @@ -1494,7 +1516,7 @@ input, `L`, so that `input = L L^*`. ### `tf.batch_cholesky(input, name=None)` {#batch_cholesky} -Calculates the Cholesky decomposition of a batch of square matrices. +Computes the Cholesky decomposition of a batch of square matrices. The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form square matrices, with the same constraints as the single matrix Cholesky @@ -1590,56 +1612,6 @@ X[3, :, 2] # Solution to the linear system A[3, :, :] x = RHS[3, :, 2] -- - - - -### `tf.self_adjoint_eig(input, name=None)` {#self_adjoint_eig} - -Calculates the Eigen Decomposition of a square Self-Adjoint matrix. - -Only the lower-triangular part of the input will be used in this case. The -upper-triangular part will not be read. - -The result is a M+1 x M matrix whose first row is the eigenvalues, and -subsequent rows are eigenvectors. - -##### Args: - - -* `input`: A `Tensor`. Must be one of the following types: `float64`, `float32`. - Shape is `[M, M]`. -* `name`: A name for the operation (optional). - -##### Returns: - - A `Tensor`. Has the same type as `input`. Shape is `[M+1, M]`. - - -- - - - -### `tf.batch_self_adjoint_eig(input, name=None)` {#batch_self_adjoint_eig} - -Calculates the Eigen Decomposition of a batch of square self-adjoint matrices. - -The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions -form square matrices, with the same constraints as the single matrix -SelfAdjointEig. - -The result is a '[..., M+1, M] matrix with [..., 0,:] containing the -eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. - -##### Args: - - -* `input`: A `Tensor`. Must be one of the following types: `float64`, `float32`. - Shape is `[..., M, M]`. -* `name`: A name for the operation (optional). - -##### Returns: - - A `Tensor`. Has the same type as `input`. Shape is `[..., M+1, M]`. 
- - - - - - ### `tf.matrix_solve(matrix, rhs, adjoint=None, name=None)` {#matrix_solve} @@ -1886,6 +1858,179 @@ typically 6-7 times slower than the fast path. If `fast` is `False` then +- - - + +### `tf.self_adjoint_eig(matrix, name=None)` {#self_adjoint_eig} + +Computes the eigen decomposition of a self-adjoint matrix. + +Computes the eigenvalues and eigenvectors of an N-by-N matrix `matrix` such +that `matrix * v[:,i] = e(i) * v[:,i]`, for i=0...N-1. + +##### Args: + + +* `matrix`: `Tensor` of shape `[N, N]`. +* `name`: string, optional name of the operation. + +##### Returns: + + +* `e`: Eigenvalues. Shape is `[N]`. +* `v`: Eigenvectors. Shape is `[N, N]`. The columns contain the eigenvectors of + `matrix`. + + +- - - + +### `tf.batch_self_adjoint_eig(tensor, name=None)` {#batch_self_adjoint_eig} + +Computes the eigen decomposition of a batch of self-adjoint matrices. + +Computes the eigenvalues and eigenvectors of the innermost N-by-N matrices +in `tensor` such that +`tensor[...,:,:] * v[..., :,i] = e(..., i) * v[...,:,i]`, for i=0...N-1. + +##### Args: + + +* `tensor`: `Tensor` of shape `[..., N, N]`. +* `name`: string, optional name of the operation. + +##### Returns: + + +* `e`: Eigenvalues. Shape is `[..., N]`. +* `v`: Eigenvectors. Shape is `[..., N, N]`. The columns of the inner most + matrices + contain eigenvectors of the corresponding matrices in `tensor` + + +- - - + +### `tf.self_adjoint_eigvals(matrix, name=None)` {#self_adjoint_eigvals} + +Computes the eigenvalues of a self-adjoint matrix. + +##### Args: + + +* `matrix`: `Tensor` of shape `[N, N]`. +* `name`: string, optional name of the operation. + +##### Returns: + + +* `e`: Eigenvalues of `matrix`. Shape is `[N]`. + + +- - - + +### `tf.batch_self_adjoint_eigvals(tensor, name=None)` {#batch_self_adjoint_eigvals} + +Computes the eigenvalues of a batch of self-adjoint matrices. + +##### Args: + + +* `tensor`: `Tensor` of shape `[..., N, N]`. +* `name`: string, optional name of the operation.
+ +##### Returns: + + +* `e`: Eigenvalues. Shape is `[..., N]`. The vector `e[..., :]` contains the `N` + eigenvalues of `tensor[..., :, :]`. + + + +- - - + +### `tf.svd(matrix, compute_uv=True, full_matrices=False, name=None)` {#svd} + +Computes the singular value decomposition of a matrix. + +Computes the SVD of `matrix` such that `matrix = u * diag(s) * +transpose(v)` + +```prettyprint +# a is a matrix. +# s is a vector of singular values. +# u is the matrix of left singular vectors. +# v is a matrix of right singular vectors. +s, u, v = svd(a) +s = svd(a, compute_uv=False) +``` + +##### Args: + + +* `matrix`: `Tensor` of shape `[M, N]`. Let `P` be the minimum of `M` and `N`. +* `compute_uv`: If `True` then left and right singular vectors will be + computed and returned in `u` and `v`, respectively. Otherwise, only the + singular values will be computed, which can be significantly faster. +* `full_matrices`: If true, compute full-sized `u` and `v`. If false + (the default), compute only the leading `P` singular vectors. + Ignored if `compute_uv` is `False`. +* `name`: string, optional name of the operation. + +##### Returns: + + +* `s`: Singular values. Shape is `[P]`. +* `u`: Left singular vectors. If `full_matrices` is `False` (default) then + shape is `[M, P]`; if `full_matrices` is `True` then shape is + `[M, M]`. Not returned if `compute_uv` is `False`. +* `v`: Right singular vectors. If `full_matrices` is `False` (default) then + shape is `[N, P]`. If `full_matrices` is `True` then shape is + `[N, N]`. Not returned if `compute_uv` is `False`. + + +- - - + +### `tf.batch_svd(tensor, compute_uv=True, full_matrices=False, name=None)` {#batch_svd} + +Computes the singular value decompositions of a batch of matrices. + +Computes the SVD of each inner matrix in `tensor` such that +`tensor[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, +:])` + +```prettyprint +# a is a tensor. +# s is a tensor of singular values.
+# u is a tensor of left singular vectors. +# v is a tensor of right singular vectors. +s, u, v = batch_svd(a) +s = batch_svd(a, compute_uv=False) +``` + +##### Args: + + +* `tensor`: `Tensor` of shape `[..., M, N]`. Let `P` be the minimum of `M` and + `N`. +* `compute_uv`: If `True` then left and right singular vectors will be + computed and returned in `u` and `v`, respectively. Otherwise, only the + singular values will be computed, which can be significantly faster. +* `full_matrices`: If true, compute full-sized `u` and `v`. If false + (the default), compute only the leading `P` singular vectors. + Ignored if `compute_uv` is `False`. +* `name`: string, optional name of the operation. + +##### Returns: + + +* `s`: Singular values. Shape is `[..., P]`. +* `u`: Left singular vectors. If `full_matrices` is `False` (default) then + shape is `[..., M, P]`; if `full_matrices` is `True` then shape is + `[..., M, M]`. Not returned if `compute_uv` is `False`. +* `v`: Right singular vectors. If `full_matrices` is `False` (default) then + shape is `[..., N, P]`. If `full_matrices` is `True` then shape is + `[..., N, N]`. Not returned if `compute_uv` is `False`. + + + ## Complex Number Functions TensorFlow provides several operations that you can use to add complex number @@ -2603,8 +2748,8 @@ tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0] * `x`: A `Tensor`. Must be one of the following types: `float32`, `float64`, - `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, - `complex128`, `qint8`, `quint8`, `qint32`, `half`. + `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, + `complex128`, `qint8`, `quint8`, `qint32`, `half`. * `axis`: A `Tensor` of type `int32` (default: 0). * `reverse`: A `bool` (default: False). * `name`: A name for the operation (optional). @@ -2620,13 +2765,15 @@ tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0] Compute the cumulative product of the tensor `x` along `axis`.
-By default, this op performs an inclusive cumprod, which means that the first +By default, this op performs an inclusive cumprod, which means that the +first element of the input is identical to the first element of the output: ```prettyprint tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c] ``` -By setting the `exclusive` kwarg to `True`, an exclusive cumprod is performed +By setting the `exclusive` kwarg to `True`, an exclusive cumprod is +performed instead: ```prettyprint tf.cumprod([a, b, c], exclusive=True) ==> [0, a, a * b] @@ -2648,8 +2795,8 @@ tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 0] * `x`: A `Tensor`. Must be one of the following types: `float32`, `float64`, - `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, - `complex128`, `qint8`, `quint8`, `qint32`, `half`. + `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, + `complex128`, `qint8`, `quint8`, `qint32`, `half`. * `axis`: A `Tensor` of type `int32` (default: 0). * `reverse`: A `bool` (default: False). * `name`: A name for the operation (optional). diff --git a/tensorflow/g3doc/api_docs/python/nn.md b/tensorflow/g3doc/api_docs/python/nn.md index 508cea34cdc..4268fdf5d21 100644 --- a/tensorflow/g3doc/api_docs/python/nn.md +++ b/tensorflow/g3doc/api_docs/python/nn.md @@ -368,7 +368,8 @@ same horizontal and vertical strides, `strides = [1, stride, stride, 1]`. * `strides`: 1-D of size 4. The stride of the sliding window for each dimension of `input`. * `padding`: A string, either `'VALID'` or `'SAME'`. The padding algorithm. - See the [comment here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) + See the [comment + here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) * `name`: A name for this operation (optional). ##### Returns: @@ -413,7 +414,8 @@ horizontal and vertical strides, `strides = [1, stride, stride, 1]`. * `strides`: 1-D of size 4. 
The strides for the depthwise convolution for each dimension of `input`. * `padding`: A string, either `'VALID'` or `'SAME'`. The padding algorithm. - See the [comment here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) + See the [comment + here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) * `name`: A name for this operation (optional). ##### Returns: @@ -1095,6 +1097,53 @@ Computes half the L2 norm of a tensor without the `sqrt`: A `Tensor`. Has the same type as `t`. 0-D. +- - - + +### `tf.nn.log_poisson_loss(log_input, targets, compute_full_loss=False, name=None)` {#log_poisson_loss} + +Computes log poisson loss given `log_input`. + +Gives the log-likelihood loss between the prediction and the target under the +assumption that the target has a poisson distribution. +Caveat: By default, this is not the exact loss, but the loss minus a + constant term [log(z!)]. That has no effect for optimization, but + does not play well with relative loss comparisons. To compute an + approximation of the log factorial term, specify + compute_full_loss=True to enable Stirling's Approximation. + +For brevity, let `c = log(x) = log_input`, `z = targets`. The log poisson +loss is + + -log(exp(-x) * (x^z) / z!) + = -log(exp(-x) * (x^z)) + log(z!) + ~ -log(exp(-x)) - log(x^z) [+ z * log(z) - z + 0.5 * log(2 * pi * z)] + [ Note the second term is the Stirling's Approximation for log(z!). + It is invariant to x and does not affect optimization, though + important for correct relative loss comparisons. It is only + computed when compute_full_loss == True. ] + = x - z * log(x) [+ z * log(z) - z + 0.5 * log(2 * pi * z)] + = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)] + +##### Args: + + +* `log_input`: A `Tensor` of type `float32` or `float64`. +* `targets`: A `Tensor` of the same type and shape as `log_input`. +* `compute_full_loss`: whether to compute the full loss. 
If false, a constant + term is dropped in favor of more efficient optimization. +* `name`: A name for the operation (optional). + +##### Returns: + + A `Tensor` of the same shape as `log_input` with the componentwise + log-Poisson losses. + +##### Raises: + + +* `ValueError`: If `log_input` and `targets` do not have the same shape. + + ## Classification @@ -1490,8 +1539,9 @@ The corresponding output is either a single `Tensor` having the same number of time steps and batch size, or a (possibly nested) tuple of such tensors, matching the nested structure of `cell.output_size`. -The parameter `sequence_length` is required and dynamic calculation is -automatically performed. +The parameter `sequence_length` is optional and is used to copy-through state +and zero-out outputs when past a batch element's sequence length. So it's more +for correctness than performance, unlike in rnn(). ##### Args: @@ -1582,15 +1632,15 @@ automatically performed. Creates a recurrent neural network specified by RNNCell `cell`. -##### The simplest form of RNN network generated is: - +The simplest form of RNN network generated is: +```py state = cell.zero_state(...) outputs = [] for input_ in inputs: output, state = cell(input_, state) outputs.append(output) return (outputs, state) - +``` However, a few other options are available: An initial state can be provided. diff --git a/tensorflow/g3doc/api_docs/python/rnn_cell.md b/tensorflow/g3doc/api_docs/python/rnn_cell.md index 94b48f5d416..5fcbd27966a 100644 --- a/tensorflow/g3doc/api_docs/python/rnn_cell.md +++ b/tensorflow/g3doc/api_docs/python/rnn_cell.md @@ -13,6 +13,11 @@ Module for constructing RNN Cells. Abstract object representing an RNN cell. +The definition of cell in this package differs from the definition used in the +literature. In the literature, cell refers to an object with a single scalar +output. The definition in this package refers to a horizontal array of such +units.
+ An RNN cell, in the most abstract setting, is anything that has a state and performs some operation that takes a matrix of inputs. This operation results in an output matrix with `self.output_size` columns. diff --git a/tensorflow/g3doc/api_docs/python/sparse_ops.md b/tensorflow/g3doc/api_docs/python/sparse_ops.md index a1d9d23bea7..e934dc765cd 100644 --- a/tensorflow/g3doc/api_docs/python/sparse_ops.md +++ b/tensorflow/g3doc/api_docs/python/sparse_ops.md @@ -350,7 +350,7 @@ The input `SparseTensor` must be in row-major order. - - - -### `tf.sparse_merge(sp_ids, sp_values, vocab_size, name=None)` {#sparse_merge} +### `tf.sparse_merge(sp_ids, sp_values, vocab_size, name=None, already_sorted=False)` {#sparse_merge} Combines a batch of feature ids and values into a single `SparseTensor`. @@ -370,14 +370,17 @@ The `SparseTensor` returned by this function has the following properties: For example, consider the following feature vectors: +```python vector1 = [-3, 0, 0, 0, 0, 0] vector2 = [ 0, 1, 0, 4, 1, 0] vector3 = [ 5, 0, 0, 9, 0, 0] +``` These might be stored sparsely in the following Example protos by storing only the feature ids (column number if the vectors are treated as a matrix) of the non-zero elements and the corresponding values: +```python examples = [Example(features={ "ids": Feature(int64_list=Int64List(value=[0])), "values": Feature(float_list=FloatList(value=[-3]))}), @@ -387,6 +390,7 @@ of the non-zero elements and the corresponding values: Example(features={ "ids": Feature(int64_list=Int64List(value=[0, 3])), "values": Feature(float_list=FloatList(value=[5, 9]))})] +``` The result of calling parse_example on these examples will produce a dictionary with entries for "ids" and "values". Passing those two objects @@ -399,9 +403,11 @@ batch, and the second dimension is the column number, i.e., the feature id); original matrix, i.e., (3, 6). 
For our example above, the output will be equal to: +```python SparseTensor(indices=[[0, 0], [1, 1], [1, 3], [1, 4], [2, 0], [2, 3]], values=[-3, 1, 4, 1, 5, 9], shape=[3, 6]) +``` ##### Args: @@ -412,6 +418,9 @@ equal to: * `vocab_size`: A scalar `int64` Tensor (or Python int) containing the new size of the last dimension, `all(0 <= sp_ids.values < vocab_size)`. * `name`: A name prefix for the returned tensors (optional) +* `already_sorted`: A boolean to specify whether the per-batch values in + `sp_values` are already sorted. If so skip sorting, False by default + (optional). ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/state_ops.md b/tensorflow/g3doc/api_docs/python/state_ops.md index 250886db3a7..4b5f4cf353c 100644 --- a/tensorflow/g3doc/api_docs/python/state_ops.md +++ b/tensorflow/g3doc/api_docs/python/state_ops.md @@ -1065,7 +1065,7 @@ create variables contingent on certain conditions. - - - -### `tf.get_variable(name, shape=None, dtype=tf.float32, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#get_variable} +### `tf.get_variable(name, shape=None, dtype=None, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#get_variable} Gets an existing variable with these parameters or create a new one. @@ -1167,9 +1167,10 @@ Attributes: partitioner: callable or `None`: the partitioner passed to `get_variable`. custom_getter: default custom getter passed to get_variable. name_scope: The name passed to `tf.name_scope`. + dtype: default type passed to get_variable (defaults to DT_FLOAT). 
- - - -#### `tf.VariableScope.__init__(reuse, name='', initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, name_scope='')` {#VariableScope.__init__} +#### `tf.VariableScope.__init__(reuse, name='', initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, name_scope='', dtype=tf.float32)` {#VariableScope.__init__} Creates a new VariableScope with the given properties. @@ -1190,7 +1191,14 @@ Creates a new VariableScope with the given properties. - - - -#### `tf.VariableScope.get_variable(var_store, name, shape=None, dtype=tf.float32, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#VariableScope.get_variable} +#### `tf.VariableScope.dtype` {#VariableScope.dtype} + + + + +- - - + +#### `tf.VariableScope.get_variable(var_store, name, shape=None, dtype=None, initializer=None, regularizer=None, trainable=True, collections=None, caching_device=None, partitioner=None, validate_shape=True, custom_getter=None)` {#VariableScope.get_variable} Gets an existing variable with this name or create a new one. @@ -1258,6 +1266,13 @@ Set caching_device for this scope. Set custom getter for this scope. +- - - + +#### `tf.VariableScope.set_dtype(dtype)` {#VariableScope.set_dtype} + +Set data type for this scope. + + - - - #### `tf.VariableScope.set_initializer(initializer)` {#VariableScope.set_initializer} @@ -1282,7 +1297,7 @@ Set regularizer for this scope. - - - -### `tf.variable_scope(name_or_scope, reuse=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None)` {#variable_scope} +### `tf.variable_scope(name_or_scope, reuse=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, dtype=None)` {#variable_scope} Returns a context for variable scope. 
@@ -1353,6 +1368,8 @@ then all its sub-scopes become reusing as well. * `caching_device`: default caching device for variables within this scope. * `partitioner`: default partitioner for variables within this scope. * `custom_getter`: default custom getter for variables within this scope. +* `dtype`: type of variables created in this scope (defaults to the type + in the passed scope, or inherited from parent scope). ##### Returns: @@ -1368,7 +1385,7 @@ then all its sub-scopes become reusing as well. - - - -### `tf.variable_op_scope(values, name_or_scope, default_name=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, reuse=None)` {#variable_op_scope} +### `tf.variable_op_scope(values, name_or_scope, default_name=None, initializer=None, regularizer=None, caching_device=None, partitioner=None, custom_getter=None, reuse=None, dtype=None)` {#variable_op_scope} Returns a context manager for defining an op that creates variables. @@ -1412,6 +1429,8 @@ def my_op_with_vars(a, b, scope=None): * `custom_getter`: The default custom getter for variables within this scope. * `reuse`: `True` or `None`; if `True`, we go into reuse mode for this scope as well as all sub-scopes; if `None`, we just inherit the parent scope reuse. +* `dtype`: The default type of variables created in this scope, defaults to the + type of the parent scope. ##### Returns: diff --git a/tensorflow/g3doc/api_docs/python/train.md b/tensorflow/g3doc/api_docs/python/train.md index 792fb2bcb78..e080769f41b 100644 --- a/tensorflow/g3doc/api_docs/python/train.md +++ b/tensorflow/g3doc/api_docs/python/train.md @@ -851,7 +851,7 @@ learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 100000, 0.96, staircase=True) # Passing global_step to minimize() will increment it at each step. 
learning_step = ( - tf.GradientDescentOptimizer(learning_rate) + tf.train.GradientDescentOptimizer(learning_rate) .minimize(...my loss..., global_step=global_step) ) ``` @@ -1251,11 +1251,14 @@ After this is called, calls to `should_stop()` will return `False`. - - - -#### `tf.train.Coordinator.join(threads, stop_grace_period_secs=120)` {#Coordinator.join} +#### `tf.train.Coordinator.join(threads=None, stop_grace_period_secs=120)` {#Coordinator.join} Wait for threads to terminate. -Blocks until all `threads` have terminated or `request_stop()` is called. +This call blocks until a set of threads have terminated. The set of thread +is the union of the threads passed in the `threads` argument and the list +of threads that registered with the coordinator by calling +`Coordinator.register_thread()`. After the threads stop, if an `exc_info` was passed to `request_stop`, that exception is re-raised. @@ -1269,7 +1272,8 @@ that `RuntimeError`. ##### Args: -* `threads`: List of `threading.Threads`. The started threads to join. +* `threads`: List of `threading.Threads`. The started threads to join in + addition to the registered threads. * `stop_grace_period_secs`: Number of seconds given to threads to stop after `request_stop()` has been called. @@ -1287,6 +1291,18 @@ that `RuntimeError`. +- - - + +#### `tf.train.Coordinator.register_thread(thread)` {#Coordinator.register_thread} + +Register a thread to join. + +##### Args: + + +* `thread`: A Python thread to join. 
+ + - - - #### `tf.train.Coordinator.request_stop(ex=None)` {#Coordinator.request_stop} diff --git a/tensorflow/g3doc/get_started/os_setup.md b/tensorflow/g3doc/get_started/os_setup.md index 92f77b27b07..d85ae429fa9 100644 --- a/tensorflow/g3doc/get_started/os_setup.md +++ b/tensorflow/g3doc/get_started/os_setup.md @@ -61,31 +61,37 @@ Then, select the correct binary to install: ```bash # Ubuntu/Linux 64-bit, CPU only, Python 2.7 -$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.9.0-cp27-none-linux_x86_64.whl +$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl # Ubuntu/Linux 64-bit, GPU enabled, Python 2.7 # Requires CUDA toolkit 7.5 and CuDNN v4. For other versions, see "Install from sources" below. -$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.9.0-cp27-none-linux_x86_64.whl +$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl # Mac OS X, CPU only, Python 2.7: -$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/tensorflow-0.9.0-py2-none-any.whl +$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.10.0rc0-py2-none-any.whl + +# Mac OS X, GPU enabled, Python 2.7: +$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.10.0rc0-py2-none-any.whl # Ubuntu/Linux 64-bit, CPU only, Python 3.4 -$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.9.0-cp34-cp34m-linux_x86_64.whl +$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl # Ubuntu/Linux 64-bit, GPU enabled, Python 3.4 # Requires CUDA toolkit 7.5 and CuDNN v4. For other versions, see "Install from sources" below. 
-$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.9.0-cp34-cp34m-linux_x86_64.whl +$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl # Ubuntu/Linux 64-bit, CPU only, Python 3.5 -$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.9.0-cp35-cp35m-linux_x86_64.whl +$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.10.0rc0-cp35-cp35m-linux_x86_64.whl # Ubuntu/Linux 64-bit, GPU enabled, Python 3.5 # Requires CUDA toolkit 7.5 and CuDNN v4. For other versions, see "Install from sources" below. -$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.9.0-cp35-cp35m-linux_x86_64.whl +$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.10.0rc0-cp35-cp35m-linux_x86_64.whl # Mac OS X, CPU only, Python 3.4 or 3.5: -$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/tensorflow-0.9.0-py3-none-any.whl +$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.10.0rc0-py3-none-any.whl + +# Mac OS X, GPU enabled, Python 3.4 or 3.5: +$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.10.0rc0-py3-none-any.whl ``` Install TensorFlow: @@ -151,31 +157,37 @@ Now, install TensorFlow just as you would for a regular Pip installation. First ```bash # Ubuntu/Linux 64-bit, CPU only, Python 2.7 -(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.9.0-cp27-none-linux_x86_64.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl # Ubuntu/Linux 64-bit, GPU enabled, Python 2.7 # Requires CUDA toolkit 7.5 and CuDNN v4. For other versions, see "Install from sources" below. 
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.9.0-cp27-none-linux_x86_64.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl # Mac OS X, CPU only, Python 2.7: -(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/tensorflow-0.9.0-py2-none-any.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.10.0rc0-py2-none-any.whl + +# Mac OS X, GPU enabled, Python 2.7: +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.10.0rc0-py2-none-any.whl # Ubuntu/Linux 64-bit, CPU only, Python 3.4 -(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.9.0-cp34-cp34m-linux_x86_64.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl # Ubuntu/Linux 64-bit, GPU enabled, Python 3.4 # Requires CUDA toolkit 7.5 and CuDNN v4. For other versions, see "Install from sources" below. -(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.9.0-cp34-cp34m-linux_x86_64.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl # Ubuntu/Linux 64-bit, CPU only, Python 3.5 -(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.9.0-cp35-cp35m-linux_x86_64.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.10.0rc0-cp35-cp35m-linux_x86_64.whl # Ubuntu/Linux 64-bit, GPU enabled, Python 3.5 # Requires CUDA toolkit 7.5 and CuDNN v4. For other versions, see "Install from sources" below. 
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.9.0-cp35-cp35m-linux_x86_64.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.10.0rc0-cp35-cp35m-linux_x86_64.whl # Mac OS X, CPU only, Python 3.4 or 3.5: -(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/tensorflow-0.9.0-py3-none-any.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.10.0rc0-py3-none-any.whl + +# Mac OS X, GPU enabled, Python 3.4 or 3.5: +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.10.0rc0-py3-none-any.whl ``` Finally install TensorFlow: @@ -228,6 +240,7 @@ packages needed by TensorFlow. * Activate the conda environment and install TensorFlow in it. * After the install you will activate the conda environment each time you want to use TensorFlow. +* Optionally install ipython and other packages into the conda environment Install Anaconda: @@ -248,6 +261,7 @@ $ conda create -n tensorflow python=3.5 Activate the environment and use conda or pip to install TensorFlow inside it. + ### Using conda A community maintained conda package is available [from conda-forge](https://github.com/conda-forge/tensorflow-feedstock). @@ -275,31 +289,37 @@ Now, install TensorFlow just as you would for a regular Pip installation. First ```bash # Ubuntu/Linux 64-bit, CPU only, Python 2.7 -(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.9.0-cp27-none-linux_x86_64.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl # Ubuntu/Linux 64-bit, GPU enabled, Python 2.7 # Requires CUDA toolkit 7.5 and CuDNN v4. For other versions, see "Install from sources" below. 
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.9.0-cp27-none-linux_x86_64.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.10.0rc0-cp27-none-linux_x86_64.whl # Mac OS X, CPU only, Python 2.7: -(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/tensorflow-0.9.0-py2-none-any.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.10.0rc0-py2-none-any.whl + +# Mac OS X, GPU enabled, Python 2.7: +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.10.0rc0-py2-none-any.whl # Ubuntu/Linux 64-bit, CPU only, Python 3.4 -(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.9.0-cp34-cp34m-linux_x86_64.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl # Ubuntu/Linux 64-bit, GPU enabled, Python 3.4 # Requires CUDA toolkit 7.5 and CuDNN v4. For other versions, see "Install from sources" below. -(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.9.0-cp34-cp34m-linux_x86_64.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.10.0rc0-cp34-cp34m-linux_x86_64.whl # Ubuntu/Linux 64-bit, CPU only, Python 3.5 -(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.9.0-cp35-cp35m-linux_x86_64.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.10.0rc0-cp35-cp35m-linux_x86_64.whl # Ubuntu/Linux 64-bit, GPU enabled, Python 3.5 # Requires CUDA toolkit 7.5 and CuDNN v4. For other versions, see "Install from sources" below. 
-(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.9.0-cp35-cp35m-linux_x86_64.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow-0.10.0rc0-cp35-cp35m-linux_x86_64.whl # Mac OS X, CPU only, Python 3.4 or 3.5: -(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/tensorflow-0.9.0-py3-none-any.whl +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.10.0rc0-py3-none-any.whl + +# Mac OS X, GPU enabled, Python 3.4 or 3.5: +(tensorflow)$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.10.0rc0-py3-none-any.whl ``` Finally install TensorFlow: @@ -336,6 +356,19 @@ $ source activate tensorflow (tensorflow)$ source deactivate ``` +### Install IPython + +To use tensorflow with IPython it may be necessary to install IPython into the tensorflow environment: + +```bash +$ source activate tensorflow +(tensorflow)$ conda install ipython +``` + +Similarly, other Python packages like pandas may need to get installed into the tensorflow environment +before they can be used together with tensorflow. + + ## Docker installation [Docker](http://docker.com/) is a system to build self contained versions of a @@ -352,7 +385,7 @@ code. * `gcr.io/tensorflow/tensorflow:latest-devel-gpu`: GPU Binary image plus source code. -We also have tags with `latest` replaced by a released version (e.g., `0.9.0-gpu`). +We also have tags with `latest` replaced by a released version (e.g., `0.10.0rc0-gpu`). With Docker the installation is as follows: @@ -411,7 +444,7 @@ variables. Consider adding the commands below to your `~/.bash_profile`. 
These assume your CUDA installation is in `/usr/local/cuda`: ```bash -export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64" +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" export CUDA_HOME=/usr/local/cuda ``` @@ -594,6 +627,8 @@ which you can install as follows: $ sudo easy_install ipython ``` +#### Optional: Setup GPU for Mac + If you plan to build with GPU support you will need to make sure you have GNU coreutils installed via homebrew: @@ -634,6 +669,26 @@ $ sudo mv lib/libcudnn* /Developer/NVIDIA/CUDA-7.5/lib $ sudo ln -s /Developer/NVIDIA/CUDA-7.5/lib/libcudnn* /usr/local/cuda/lib/ ``` +To verify the CUDA installation, you can build and run deviceQuery to make sure +it passes. + +```bash +$ cp -r /usr/local/cuda/samples ~/cuda-samples +$ pushd ~/cuda-samples +$ make +$ popd +$ ~/cuda-samples/bin/x86_64/darwin/release/deviceQuery +``` + +If you want to compile tensorflow and have the XCode 7.3 installed, note that +Xcode 7.3 is not yet compatible with CUDA 7.5. You will need to download Xcode +7.2 and select it as your default: + +```bash +$ sudo xcode-select -s /Application/Xcode-7.2/Xcode.app +``` + + ### Configure the installation Run the `configure` script at the root of the tree. The configure script @@ -719,7 +774,7 @@ $ bazel build -c opt --config=cuda //tensorflow/tools/pip_package:build_pip_pack $ bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg # The name of the .whl file will depend on your platform. 
-$ sudo pip install /tmp/tensorflow_pkg/tensorflow-0.9.0-py2-none-any.whl +$ sudo pip install /tmp/tensorflow_pkg/tensorflow-0.10.0rc0-py2-none-any.whl ``` ## Setting up TensorFlow for Development diff --git a/tensorflow/g3doc/how_tos/adding_an_op/index.md b/tensorflow/g3doc/how_tos/adding_an_op/index.md index 59cab6329a2..767364b16df 100644 --- a/tensorflow/g3doc/how_tos/adding_an_op/index.md +++ b/tensorflow/g3doc/how_tos/adding_an_op/index.md @@ -1011,13 +1011,13 @@ function which computes gradients with respect to the ops' inputs given gradients with respect to the ops' outputs. Mathematically, if an op computes \\(y = f(x)\\) the registered gradient op -converts gradients \\(\partial / \partial y\\) with respect to \\(y\\) into -gradients \\(\partial / \partial x\\) with respect to \\(x\\) via the chain -rule: +converts gradients \\(\partial L/ \partial y\\) of loss \\(L\\) with respect to +\\(y\\) into gradients \\(\partial L/ \partial x\\) with respect to \\(x\\) via +the chain rule: -$$\frac{\partial}{\partial x} - = \frac{\partial}{\partial y} \frac{\partial y}{\partial x} - = \frac{\partial}{\partial y} \frac{\partial f}{\partial x}.$$ +$$\frac{\partial L}{\partial x} + = \frac{\partial L}{\partial y} \frac{\partial y}{\partial x} + = \frac{\partial L}{\partial y} \frac{\partial f}{\partial x}.$$ In the case of `ZeroOut`, only one entry in the input affects the output, so the gradient with respect to the input is a sparse "one hot" tensor. This is diff --git a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md index 0d733ce9941..8183cdf0247 100644 --- a/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md +++ b/tensorflow/g3doc/how_tos/summaries_and_tensorboard/index.md @@ -201,4 +201,4 @@ For in depth information on how to use the *graph* tab to visualize your graph, see [TensorBoard: Graph Visualization](../../how_tos/graph_viz/index.md). 
For more usage information on TensorBoard in general, see the [TensorBoard -Readme](../../../tensorboard/README.md). +README](https://www.tensorflow.org/code/tensorflow/tensorboard/README.md). diff --git a/tensorflow/g3doc/how_tos/using_gpu/index.md b/tensorflow/g3doc/how_tos/using_gpu/index.md index e3e16fa5752..47f14a95189 100644 --- a/tensorflow/g3doc/how_tos/using_gpu/index.md +++ b/tensorflow/g3doc/how_tos/using_gpu/index.md @@ -58,7 +58,7 @@ within that context will have the same device assignment. with tf.device('/cpu:0'): a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a') b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b') -c = tf.matmul(a, b) + c = tf.matmul(a, b) # Creates a session with log_device_placement set to True. sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) # Runs the op. diff --git a/tensorflow/g3doc/resources/faq.md b/tensorflow/g3doc/resources/faq.md index 61437286545..6dc208f8dd6 100644 --- a/tensorflow/g3doc/resources/faq.md +++ b/tensorflow/g3doc/resources/faq.md @@ -147,7 +147,7 @@ graphs and running steps; we also have an experimental API for We would like to support more client languages, as determined by community interest. TensorFlow has a -[C-based client API](https://www.tensorflow.org/code/tensorflow/core/public/tensor_c_api.h) +[C-based client API](https://www.tensorflow.org/code/tensorflow/c/c_api.h) that makes it easy to build a client in many different languages. We invite contributions of new language bindings. diff --git a/tensorflow/g3doc/tutorials/index.md b/tensorflow/g3doc/tutorials/index.md index 292596837da..c634a6f6add 100644 --- a/tensorflow/g3doc/tutorials/index.md +++ b/tensorflow/g3doc/tutorials/index.md @@ -2,6 +2,10 @@ ## Basic Neural Networks +The first few Tensorflow tutorials guide you through training and testing a +simple neural network to classify handwritten digits from the MNIST database of +digit images. 
+ ### MNIST For ML Beginners If you're new to machine learning, we recommend starting here. You'll learn @@ -27,13 +31,6 @@ example. [View Tutorial](../tutorials/mnist/tf/index.md) -### MNIST Data Download - -Details about downloading the MNIST handwritten digits data set. Exciting -stuff. - -[View Tutorial](../tutorials/mnist/download/index.md) - ## Easy ML with tf.contrib.learn @@ -66,6 +63,12 @@ model and a deep neural net to harness the advantages of each type of model. [View Tutorial](../tutorials/wide_and_deep/index.md) +### Logging and Monitoring Basics with tf.contrib.learn + +This tutorial shows you how to use TensorFlow’s logging capabilities and the +Monitor API to audit the in-progress training of a neural network. + +[View Tutorial](../tutorials/monitors/index.md) ## TensorFlow Serving diff --git a/tensorflow/g3doc/tutorials/leftnav_files b/tensorflow/g3doc/tutorials/leftnav_files index 09cd084b490..75ef57f59fa 100644 --- a/tensorflow/g3doc/tutorials/leftnav_files +++ b/tensorflow/g3doc/tutorials/leftnav_files @@ -2,12 +2,12 @@ mnist/beginners/index.md mnist/pros/index.md mnist/tf/index.md -mnist/download/index.md ### Easy ML with tf.contrib.learn tflearn/index.md linear/overview.md wide/index.md wide_and_deep/index.md +monitors/index.md ### TensorFlow Serving tfserve/index.md ### Image Processing diff --git a/tensorflow/g3doc/tutorials/linear/overview.md b/tensorflow/g3doc/tutorials/linear/overview.md index f8fd1ab0de8..aafa1585760 100644 --- a/tensorflow/g3doc/tutorials/linear/overview.md +++ b/tensorflow/g3doc/tutorials/linear/overview.md @@ -174,11 +174,11 @@ that value. indicating how to represent and transform the data. But they do not provide the data itself. You provide the data through an input function. -The input function must return a dictionary of tensors. Each key corresponds -to the name of a `FeatureColumn`. Each key's value is a tensor containing the +The input function must return a dictionary of tensors. 
Each key corresponds to +the name of a `FeatureColumn`. Each key's value is a tensor containing the values of that feature for all data instances. See `input_fn` in the [linear -models tutorial code]( -https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py?l=160) +models tutorial code]( +https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py) for an example of an input function. The input function is passed to the `fit()` and `evaluate()` calls that diff --git a/tensorflow/g3doc/tutorials/mnist/beginners/index.md b/tensorflow/g3doc/tutorials/mnist/beginners/index.md index 5d099c4bf2f..e3302db200c 100644 --- a/tensorflow/g3doc/tutorials/mnist/beginners/index.md +++ b/tensorflow/g3doc/tutorials/mnist/beginners/index.md @@ -1,11 +1,11 @@ # MNIST For ML Beginners *This tutorial is intended for readers who are new to both machine learning and -TensorFlow. If you already -know what MNIST is, and what softmax (multinomial logistic) regression is, -you might prefer this [faster paced tutorial](../pros/index.md). -Be sure to [install TensorFlow](../../../get_started/os_setup.md) before -starting either tutorial.* +TensorFlow. If you already know what MNIST is, and what softmax (multinomial +logistic) regression is, you might prefer this +[faster paced tutorial](../pros/index.md). Be sure to +[install TensorFlow](../../../get_started/os_setup.md) before starting either +tutorial.* When one learns how to program, there's a tradition that the first thing you do is print "Hello World." Just like programming has Hello World, machine learning @@ -33,21 +33,45 @@ important to understand the ideas behind it: both how TensorFlow works and the core machine learning concepts. Because of this, we are going to very carefully work through the code.
+## About this tutorial + +This tutorial is an explanation, line by line, of what is happening in the +[mnist_softmax.py](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_softmax.py) code. + +You can use this tutorial in a few different ways, including: + +- Copy and paste each code snippet, line by line, into a Python environment as + you read through the explanations of each line. + +- Run the entire `mnist_softmax.py` Python file either before or after reading + through the explanations, and use this tutorial to understand the lines of + code that aren't clear to you. + +What we will accomplish in this tutorial: + +- Learn about the MNIST data and softmax regressions + +- Create a function that is a model for recognizing digits, based on looking at + every pixel in the image + +- Use TensorFlow to train the model to recognize digits by having it "look" at + thousands of examples (and run our first TensorFlow session to do so) + +- Check the model's accuracy with our test data + ## The MNIST Data The MNIST data is hosted on
If you are copying and +pasting in the code from this tutorial, start here with these two lines of code +which will download and read in the data automatically: ```python from tensorflow.examples.tutorials.mnist import input_data mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) ``` -The downloaded data is split into three parts, 55,000 data points of training +The MNIST data is split into three parts: 55,000 data points of training data (`mnist.train`), 10,000 points of test data (`mnist.test`), and 5,000 points of validation data (`mnist.validation`). This split is very important: it's essential in machine learning that we have separate data which we don't @@ -55,10 +79,10 @@ learn from so that we can make sure that what we've learned actually generalizes! As mentioned earlier, every MNIST data point has two parts: an image of a -handwritten digit and a corresponding label. We will call the images "xs" and -the labels "ys". Both the training set and test set contain xs and ys, for -example the training images are `mnist.train.images` and the train labels are -`mnist.train.labels`. +handwritten digit and a corresponding label. We'll call the images "x" +and the labels "y". Both the training set and test set contain images and their +corresponding labels; for example the training images are `mnist.train.images` +and the training labels are `mnist.train.labels`. Each image is 28 pixels by 28 pixels. We can interpret this as a big array of numbers: @@ -77,26 +101,26 @@ From this perspective, the MNIST images are just a bunch of points in a Flattening the data throws away information about the 2D structure of the image. Isn't that bad? Well, the best computer vision methods do exploit this structure, and we will in later tutorials. But the simple method we will be -using here, a softmax regression, won't. +using here, a softmax regression (defined below), won't. 
The result is that `mnist.train.images` is a tensor (an n-dimensional array) -with a shape of `[55000, 784]`. The first dimension indexes the images and the -second dimension indexes the pixels in each image. Each entry in the tensor is -the pixel intensity between 0 and 1, for a particular pixel in a particular -image. +with a shape of `[55000, 784]`. The first dimension is an index into the list +of images and the second dimension is the index for each pixel in each image. +Each entry in the tensor is a pixel intensity between 0 and 1, for a particular +pixel in a particular image.
-The corresponding labels in MNIST are numbers between 0 and 9, describing -which digit a given image is of. -For the purposes of this tutorial, we're going to want our labels -as "one-hot vectors". A one-hot vector is a vector which is 0 in most -dimensions, and 1 in a single dimension. In this case, the \\(n\\)th digit will -be represented as a vector which is 1 in the \\(n\\)th dimensions. For example, -3 would be \\([0,0,0,1,0,0,0,0,0,0]\\). -Consequently, `mnist.train.labels` is a +Each image in MNIST has a corresponding label, a number between 0 and 9 +representing the digit drawn in the image. + +For the purposes of this tutorial, we're going to want our labels as "one-hot +vectors". A one-hot vector is a vector which is 0 in most dimensions, and 1 in a +single dimension. In this case, the \\(n\\)th digit will be represented as a +vector which is 1 in the \\(n\\)th dimensions. For example, 3 would be +\\([0,0,0,1,0,0,0,0,0,0]\\). Consequently, `mnist.train.labels` is a `[55000, 10]` array of floats.
@@ -107,24 +131,26 @@ We're now ready to actually make our model! ## Softmax Regressions -We know that every image in MNIST is a digit, whether it's a zero or a nine. We -want to be able to look at an image and give probabilities for it being each +We know that every image in MNIST is of a handwritten digit between zero and +nine. So there are only ten possible things that a given image can be. We want +to be able to look at an image and give the probabilities for it being each digit. For example, our model might look at a picture of a nine and be 80% sure it's a nine, but give a 5% chance to it being an eight (because of the top loop) -and a bit of probability to all the others because it isn't sure. +and a bit of probability to all the others because it isn't 100% sure. This is a classic case where a softmax regression is a natural, simple model. If you want to assign probabilities to an object being one of several different -things, softmax is the thing to do. Even later on, when we train more -sophisticated models, the final step will be a layer of softmax. +things, softmax is the thing to do, because softmax gives us a list of values +between 0 and 1 that add up to 1. Even later on, when we train more sophisticated +models, the final step will be a layer of softmax. A softmax regression has two steps: first we add up the evidence of our input being in certain classes, and then we convert that evidence into probabilities. To tally up the evidence that a given image is in a particular class, we do a weighted sum of the pixel intensities. The weight is negative if that pixel -having a high intensity is evidence against the image being in that class, -and positive if it is evidence in favor. +having a high intensity is evidence against the image being in that class, and +positive if it is evidence in favor. The following diagram shows the weights one model learned for each of these classes.
Red represents negative weights, while blue represents positive @@ -160,18 +186,16 @@ If you expand that equation out, you get: $$\text{softmax}(x)_i = \frac{\exp(x_i)}{\sum_j \exp(x_j)}$$ -But it's often more helpful to think of softmax the first way: -exponentiating its inputs and then normalizing them. -The exponentiation means that one more unit of evidence increases the weight -given to any hypothesis multiplicatively. -And conversely, having one less unit of evidence means that a -hypothesis gets a fraction of its earlier weight. No hypothesis ever has zero -or negative weight. Softmax then normalizes these weights, so that they add up -to one, forming a valid probability distribution. (To get more intuition about -the softmax function, check out the -[section](http://neuralnetworksanddeeplearning.com/chap3.html#softmax) -on it in Michael Nielsen's book, complete with an interactive visualization.) - +But it's often more helpful to think of softmax the first way: exponentiating +its inputs and then normalizing them. The exponentiation means that one more +unit of evidence increases the weight given to any hypothesis multiplicatively. +And conversely, having one less unit of evidence means that a hypothesis gets a +fraction of its earlier weight. No hypothesis ever has zero or negative +weight. Softmax then normalizes these weights, so that they add up to one, +forming a valid probability distribution. (To get more intuition about the +softmax function, check out the +[section](http://neuralnetworksanddeeplearning.com/chap3.html#softmax) on it in +Michael Nielsen's book, complete with an interactive visualization.) You can picture our softmax regression as looking something like the following, although with a lot more \\(x\\)s. For each output, we compute a weighted sum of @@ -199,26 +223,26 @@ More compactly, we can just write: $$y = \text{softmax}(Wx + b)$$ +Now let's turn that into something that Tensorflow can use. 
## Implementing the Regression To do efficient numerical computing in Python, we typically use libraries like -NumPy that do expensive operations such as matrix multiplication outside Python, -using highly efficient code implemented in another language. -Unfortunately, there can still be a lot of overhead from switching back to -Python every operation. This overhead is especially bad if you want to run -computations on GPUs or in a distributed manner, where there can be a high cost -to transferring data. +[NumPy](http://www.numpy.org/) that do expensive operations such as matrix +multiplication outside Python, using highly efficient code implemented in +another language. Unfortunately, there can still be a lot of overhead from +switching back to Python every operation. This overhead is especially bad if you +want to run computations on GPUs or in a distributed manner, where there can be +a high cost to transferring data. -TensorFlow also does its heavy lifting outside python, -but it takes things a step further to avoid this overhead. -Instead of running a single expensive operation independently -from Python, TensorFlow lets us describe a graph of interacting operations that -run entirely outside Python. (Approaches like this can be seen in a few -machine learning libraries.) +TensorFlow also does its heavy lifting outside Python, but it takes things a +step further to avoid this overhead. Instead of running a single expensive +operation independently from Python, TensorFlow lets us describe a graph of +interacting operations that run entirely outside Python. (Approaches like this +can be seen in a few machine learning libraries.) -To use TensorFlow, we need to import it. +To use TensorFlow, first we need to import it. ```python import tensorflow as tf @@ -239,11 +263,10 @@ this as a 2-D tensor of floating-point numbers, with a shape `[None, 784]`. We also need the weights and biases for our model. 
We could imagine treating these like additional inputs, but TensorFlow has an even better way to handle -it: `Variable`. -A `Variable` is a modifiable tensor that lives in TensorFlow's graph of -interacting -operations. It can be used and even modified by the computation. For machine -learning applications, one generally has the model parameters be `Variable`s. +it: `Variable`. A `Variable` is a modifiable tensor that lives in TensorFlow's +graph of interacting operations. It can be used and even modified by the +computation. For machine learning applications, one generally has the model +parameters be `Variable`s. ```python W = tf.Variable(tf.zeros([784, 10])) @@ -260,17 +283,16 @@ Notice that `W` has a shape of [784, 10] because we want to multiply the evidence for the difference classes. `b` has a shape of [10] so we can add it to the output. -We can now implement our model. It only takes one line! +We can now implement our model. It only takes one line to define it! ```python y = tf.nn.softmax(tf.matmul(x, W) + b) ``` First, we multiply `x` by `W` with the expression `tf.matmul(x, W)`. This is -flipped from when we multiplied them in our equation, where we had \\(Wx\\), as a -small trick -to deal with `x` being a 2D tensor with multiple inputs. We then add `b`, and -finally apply `tf.nn.softmax`. +flipped from when we multiplied them in our equation, where we had \\(Wx\\), as +a small trick to deal with `x` being a 2D tensor with multiple inputs. We then +add `b`, and finally apply `tf.nn.softmax`. That's it. It only took us one line to define our model, after a couple short lines of setup. That isn't because TensorFlow is designed to make a softmax @@ -282,79 +304,80 @@ your computer's CPU, GPUs, and even phones! ## Training -In order to train our model, we need to define what it means for the model to -be good. 
Well, actually, in machine learning we typically define what it means -for a model to be bad, called the cost or loss, and then try to minimize how bad -it is. But the two are equivalent. +In order to train our model, we need to define what it means for the model to be +good. Well, actually, in machine learning we typically define what it means for +a model to be bad. We call this the cost, or the loss, and it represents how far +off our model is from our desired outcome. We try to minimize that error, and +the smaller the error margin, the better our model is. -One very common, very nice cost function is "cross-entropy." Surprisingly, -cross-entropy arises from thinking about information compressing codes in -information theory but it winds up being an important idea in lots of areas, -from gambling to machine learning. It's defined: +One very common, very nice function to determine the loss of a model is called +"cross-entropy." Cross-entropy arises from thinking about information +compressing codes in information theory but it winds up being an important idea +in lots of areas, from gambling to machine learning. It's defined as: $$H_{y'}(y) = -\sum_i y'_i \log(y_i)$$ Where \\(y\\) is our predicted probability distribution, and \\(y'\\) is the true -distribution (the one-hot vector we'll input). In some rough sense, the +distribution (the one-hot vector with the digit labels). In some rough sense, the cross-entropy is measuring how inefficient our predictions are for describing the truth. Going into more detail about cross-entropy is beyond the scope of this tutorial, but it's well worth [understanding](http://colah.github.io/posts/2015-09-Visual-Information/). 
-To implement cross-entropy we need to first add a new placeholder to input -the correct answers: +To implement cross-entropy we need to first add a new placeholder to input the +correct answers: ```python y_ = tf.placeholder(tf.float32, [None, 10]) ``` -Then we can implement the cross-entropy, \\(-\sum y'\log(y)\\): +Then we can implement the cross-entropy function, \\(-\sum y'\log(y)\\): ```python cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1])) ``` First, `tf.log` computes the logarithm of each element of `y`. Next, we multiply -each element of `y_` with the corresponding element of `tf.log(y)`. Then -`tf.reduce_sum` adds the elements in the second dimension of y, due to the -`reduction_indices=[1]` parameter. Finally, `tf.reduce_mean` computes the mean +each element of `y_` with the corresponding element of `tf.log(y)`. Then +`tf.reduce_sum` adds the elements in the second dimension of y, due to the +`reduction_indices=[1]` parameter. Finally, `tf.reduce_mean` computes the mean over all the examples in the batch. Now that we know what we want our model to do, it's very easy to have TensorFlow -train it to do so. -Because TensorFlow knows the entire graph of your computations, it -can automatically use the [backpropagation -algorithm](http://colah.github.io/posts/2015-08-Backprop/) -to efficiently determine how your variables affect the cost you ask it to +train it to do so. Because TensorFlow knows the entire graph of your +computations, it can automatically use the +[backpropagation algorithm](http://colah.github.io/posts/2015-08-Backprop/) to +efficiently determine how your variables affect the loss you ask it to minimize. Then it can apply your choice of optimization algorithm to modify the -variables and reduce the cost. +variables and reduce the loss. 
```python train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy) ``` -In this case, we ask TensorFlow to minimize `cross_entropy` using the gradient -descent algorithm with a learning rate of 0.5. Gradient descent is a simple -procedure, where TensorFlow simply shifts each variable a little bit in the -direction that reduces the cost. But TensorFlow also provides +In this case, we ask TensorFlow to minimize `cross_entropy` using the +[gradient descent algorithm](https://en.wikipedia.org/wiki/Gradient_descent) +with a learning rate of 0.5. Gradient descent is a simple procedure, where +TensorFlow simply shifts each variable a little bit in the direction that +reduces the cost. But TensorFlow also provides [many other optimization algorithms] (../../../api_docs/python/train.md#optimizers): using one is as simple as tweaking one line. -What TensorFlow actually does here, behind the scenes, is it adds new operations -to your graph which -implement backpropagation and gradient descent. Then it gives you back a -single operation which, when run, will do a step of gradient descent training, -slightly tweaking your variables to reduce the cost. +What TensorFlow actually does here, behind the scenes, is to add new operations +to your graph which implement backpropagation and gradient descent. Then it +gives you back a single operation which, when run, does a step of gradient +descent training, slightly tweaking your variables to reduce the loss. -Now we have our model set up to train. One last thing before we launch it, -we have to add an operation to initialize the variables we created: +Now we have our model set up to train. One last thing before we launch it, we +have to create an operation to initialize the variables we created. 
Note that +this defines the operation but does not run it yet: ```python init = tf.initialize_all_variables() ``` -We can now launch the model in a `Session`, and run the operation that +We can now launch the model in a `Session`, and now we run the operation that initializes the variables: ```python @@ -374,10 +397,10 @@ Each step of the loop, we get a "batch" of one hundred random data points from our training set. We run `train_step` feeding in the batches data to replace the `placeholder`s. -Using small batches of random data is called stochastic training -- in -this case, stochastic gradient descent. Ideally, we'd like to use all our data -for every step of training because that would give us a better sense of what -we should be doing, but that's expensive. So, instead, we use a different subset +Using small batches of random data is called stochastic training -- in this +case, stochastic gradient descent. Ideally, we'd like to use all our data for +every step of training because that would give us a better sense of what we +should be doing, but that's expensive. So, instead, we use a different subset every time. Doing this is cheap and has much of the same benefit. @@ -414,12 +437,12 @@ print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels} This should be about 92%. Is that good? Well, not really. In fact, it's pretty bad. This is because we're -using a very simple model. With some small changes, we can get to -97%. The best models can get to over 99.7% accuracy! (For more information, have -a look at this +using a very simple model. With some small changes, we can get to 97%. The best +models can get to over 99.7% accuracy! (For more information, have a look at +this [list of results](http://rodrigob.github.io/are_we_there_yet/build/classification_datasets_results.html).) What matters is that we learned from this model. 
Still, if you're feeling a bit -down about these results, check out [the next tutorial](../../../tutorials/mnist/pros/index.md) where we -do a lot better, and learn how to build more sophisticated models using -TensorFlow! +down about these results, check out +[the next tutorial](../../../tutorials/mnist/pros/index.md) where we do a lot +better, and learn how to build more sophisticated models using TensorFlow! diff --git a/tensorflow/g3doc/tutorials/mnist/download/index.md b/tensorflow/g3doc/tutorials/mnist/download/index.md deleted file mode 100644 index 16ff9e84227..00000000000 --- a/tensorflow/g3doc/tutorials/mnist/download/index.md +++ /dev/null @@ -1,85 +0,0 @@ -# MNIST Data Download - -Code: [tensorflow/examples/tutorials/mnist/](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/) - -The goal of this tutorial is to show how to download the dataset files required -for handwritten digit classification using the (classic) MNIST data set. - -## Tutorial Files - -This tutorial references the following files: - -File | Purpose ---- | --- -[`input_data.py`](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/input_data.py) | The code to download the MNIST dataset for training and evaluation. - -## Prepare the Data - -MNIST is a classic problem in machine learning. The problem is to look at -greyscale 28x28 pixel images of handwritten digits and determine which digit -the image represents, for all the digits from zero to nine. - -![MNIST Digits](../../../images/mnist_digits.png "MNIST Digits") - -For more information, refer to [Yann LeCun's MNIST page](http://yann.lecun.com/exdb/mnist/) -or [Chris Olah's visualizations of MNIST](http://colah.github.io/posts/2014-10-Visualizing-MNIST/). - -### Download - -[Yann LeCun's MNIST page](http://yann.lecun.com/exdb/mnist/) -also hosts the training and test data for download. 
- -File | Purpose ---- | --- -[`train-images-idx3-ubyte.gz`](http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz) | training set images - 55000 training images, 5000 validation images -[`train-labels-idx1-ubyte.gz`](http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz) | training set labels matching the images -[`t10k-images-idx3-ubyte.gz`](http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz) | test set images - 10000 images -[`t10k-labels-idx1-ubyte.gz`](http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz) | test set labels matching the images - -In the `input_data.py` file, the `maybe_download()` function will ensure these -files are downloaded into a local data folder for training. - -The folder name is specified in a flag variable at the top of the -`fully_connected_feed.py` file and may be changed to fit your needs. - -### Unpack and Reshape - -The files themselves are not in any standard image format and are manually -unpacked (following the instructions available at the website) by the -`extract_images()` and `extract_labels()` functions in `input_data.py`. - -The image data is extracted into a 2d tensor of: `[image index, pixel index]` -where each entry is the intensity value of a specific pixel in a specific -image, rescaled from `[0, 255]` to `[0, 1]`. The "image index" corresponds -to an image in the dataset, counting up from zero to the size of the dataset. -And the "pixel index" corresponds to a specific pixel in that image, ranging -from zero to the number of pixels in the image. - -The 60000 examples in the `train-*` files are then split into 55000 examples -for training and 5000 examples for validation. For all of the 28x28 -pixel greyscale images in the datasets the image size is 784 and so the output -tensor for the training set images is of shape `[55000, 784]`. - -The label data is extracted into a 1d tensor of: `[image index]` -with the class identifier for each example as the value. 
For the training set -labels, this would then be of shape `[55000]`. - -### DataSet Object - -The underlying code will download, unpack, and reshape images and labels for -the following datasets: - -Dataset | Purpose ---- | --- -`data_sets.train` | 55000 images and labels, for primary training. -`data_sets.validation` | 5000 images and labels, for iterative validation of training accuracy. -`data_sets.test` | 10000 images and labels, for final testing of trained accuracy. - -The `read_data_sets()` function will return a dictionary with a `DataSet` -instance for each of these three sets of data. The `DataSet.next_batch()` -method can be used to fetch a tuple consisting of `batch_size` lists of images -and labels to be fed into the running TensorFlow session. - -```python -images_feed, labels_feed = data_set.next_batch(FLAGS.batch_size) -``` diff --git a/tensorflow/g3doc/tutorials/mnist/pros/index.md b/tensorflow/g3doc/tutorials/mnist/pros/index.md index f0bb36220a6..d3a0af6e652 100644 --- a/tensorflow/g3doc/tutorials/mnist/pros/index.md +++ b/tensorflow/g3doc/tutorials/mnist/pros/index.md @@ -1,10 +1,9 @@ -# Deep MNIST for Experts +# Deep MNIST for Experts TensorFlow is a powerful library for doing large-scale numerical computation. One of the tasks at which it excels is implementing and training deep neural -networks. -In this tutorial we will learn the basic building blocks of a TensorFlow model -while constructing a deep convolutional MNIST classifier. +networks. In this tutorial we will learn the basic building blocks of a +TensorFlow model while constructing a deep convolutional MNIST classifier. *This introduction assumes familiarity with neural networks and the MNIST dataset. If you don't have @@ -12,6 +11,30 @@ a background with them, check out the [introduction for beginners](../beginners/index.md). 
Be sure to [install TensorFlow](../../../get_started/os_setup.md) before starting.* + +## About this tutorial + +The first part of this tutorial explains what is happening in the +[mnist_softmax.py](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_softmax.py) +code, which is a basic implementation of a Tensorflow model. The second part +shows some ways to improve the accuracy. + +You can copy and paste each code snippet from this tutorial into a Python +environment, or you can choose to just read through the code. + +What we will accomplish in this tutorial: + +- Create a softmax regression function that is a model for recognizing MNIST + digits, based on looking at every pixel in the image + +- Use Tensorflow to train the model to recognize digits by having it "look" at + thousands of examples (and run our first Tensorflow session to do so) + +- Check the model's accuracy with our test data + +- Build, train, and test a multilayer convolutional neural network to improve + the results + ## Setup Before we create our model, we will first load the MNIST dataset, and start a @@ -19,10 +42,8 @@ TensorFlow session. ### Load MNIST Data -For your convenience, we've included -[a script](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/input_data.py) -which will help you download and import the MNIST dataset. Run the following commands to create a -directory `'MNIST_data'` in the current folder, the data files will be stored inside that directory. +If you are copying and pasting in the code from this tutorial, start here with +these two lines of code which will download and read in the data automatically: ```python from tensorflow.examples.tutorials.mnist import input_data @@ -30,9 +51,8 @@ mnist = input_data.read_data_sets('MNIST_data', one_hot=True) ``` Here `mnist` is a lightweight class which stores the training, validation, and -testing sets as NumPy arrays. 
-It also provides a function for iterating through data minibatches, which we -will use below. +testing sets as NumPy arrays. It also provides a function for iterating through +data minibatches, which we will use below. ### Start TensorFlow InteractiveSession @@ -40,17 +60,15 @@ TensorFlow relies on a highly efficient C++ backend to do its computation. The connection to this backend is called a session. The common usage for TensorFlow programs is to first create a graph and then launch it in a session. -Here we instead use the convenient `InteractiveSession` class, which -makes TensorFlow more flexible about how you -structure your code. -It allows you to interleave operations which build a +Here we instead use the convenient `InteractiveSession` class, which makes +TensorFlow more flexible about how you structure your code. It allows you to +interleave operations which build a [computation graph](../../../get_started/basic_usage.md#the-computation-graph) -with ones that run the graph. -This is particularly convenient when working in interactive contexts like -IPython. -If you are not using an `InteractiveSession`, then you should build -the entire computation graph before starting a session and [launching the -graph](../../../get_started/basic_usage.md#launching-the-graph-in-a-session). +with ones that run the graph. This is particularly convenient when working in +interactive contexts like IPython. If you are not using an +`InteractiveSession`, then you should build the entire computation graph before +starting a session and +[launching the graph](../../../get_started/basic_usage.md#launching-the-graph-in-a-session). ```python import tensorflow as tf @@ -60,19 +78,18 @@ sess = tf.InteractiveSession() #### Computation Graph To do efficient numerical computing in Python, we typically use libraries like -NumPy that do expensive operations such as matrix multiplication outside Python, -using highly efficient code implemented in another language. 
-Unfortunately, there can still be a lot of overhead from switching back to -Python every operation. This overhead is especially bad if you want to run -computations on GPUs or in a distributed manner, where there can be a high cost -to transferring data. +[NumPy](http://www.numpy.org/) that do expensive operations such as matrix +multiplication outside Python, using highly efficient code implemented in +another language. Unfortunately, there can still be a lot of overhead from +switching back to Python every operation. This overhead is especially bad if you +want to run computations on GPUs or in a distributed manner, where there can be +a high cost to transferring data. -TensorFlow also does its heavy lifting outside Python, -but it takes things a step further to avoid this overhead. -Instead of running a single expensive operation independently -from Python, TensorFlow lets us describe a graph of interacting operations that -run entirely outside Python. -This approach is similar to that used in Theano or Torch. +TensorFlow also does its heavy lifting outside Python, but it takes things a +step further to avoid this overhead. Instead of running a single expensive +operation independently from Python, TensorFlow lets us describe a graph of +interacting operations that run entirely outside Python. This approach is +similar to that used in Theano or Torch. The role of the Python code is therefore to build this external computation graph, and to dictate which parts of the computation graph should be run. See @@ -102,59 +119,58 @@ Here `x` and `y_` aren't specific values. Rather, they are each a `placeholder` -- a value that we'll input when we ask TensorFlow to run a computation. The input images `x` will consist of a 2d tensor of floating point numbers. 
-Here we assign it a `shape` of `[None, 784]`, where `784` is the dimensionality of -a single flattened MNIST image, and `None` indicates that the first dimension, -corresponding to the batch size, can be of any size. -The target output classes `y_` will also consist of a 2d tensor, -where each row is a one-hot 10-dimensional vector indicating -which digit class the corresponding MNIST image belongs to. +Here we assign it a `shape` of `[None, 784]`, where `784` is the dimensionality +of a single flattened 28 by 28 pixel MNIST image, and `None` indicates that the +first dimension, corresponding to the batch size, can be of any size. The +target output classes `y_` will also consist of a 2d tensor, where each row is a +one-hot 10-dimensional vector indicating which digit class (zero through nine) +the corresponding MNIST image belongs to. The `shape` argument to `placeholder` is optional, but it allows TensorFlow to automatically catch bugs stemming from inconsistent tensor shapes. ### Variables -We now define the weights `W` and biases `b` for our model. We could imagine treating -these like additional inputs, but TensorFlow has an even better way to handle -them: `Variable`. -A `Variable` is a value that lives in TensorFlow's computation graph. -It can be used and even modified by the computation. In machine -learning applications, one generally has the model parameters be `Variable`s. +We now define the weights `W` and biases `b` for our model. We could imagine +treating these like additional inputs, but TensorFlow has an even better way to +handle them: `Variable`. A `Variable` is a value that lives in TensorFlow's +computation graph. It can be used and even modified by the computation. In +machine learning applications, one generally has the model parameters be +`Variable`s. ```python W = tf.Variable(tf.zeros([784,10])) b = tf.Variable(tf.zeros([10])) ``` -We pass the initial value for each parameter in the call to `tf.Variable`. 
-In this case, we initialize both `W` and `b` as tensors full of -zeros. `W` is a 784x10 matrix (because we have 784 input features -and 10 outputs) and `b` is a 10-dimensional vector (because we have 10 classes). +We pass the initial value for each parameter in the call to `tf.Variable`. In +this case, we initialize both `W` and `b` as tensors full of zeros. `W` is a +784x10 matrix (because we have 784 input features and 10 outputs) and `b` is a +10-dimensional vector (because we have 10 classes). Before `Variable`s can be used within a session, they must be initialized using -that session. -This step takes the initial values (in this case tensors full of zeros) that -have already been specified, and assigns them to each `Variable`. This can be -done for all `Variables` at once. +that session. This step takes the initial values (in this case tensors full of +zeros) that have already been specified, and assigns them to each +`Variable`. This can be done for all `Variables` at once: ```python sess.run(tf.initialize_all_variables()) ``` -### Predicted Class and Cost Function +### Predicted Class and Loss Function -We can now implement our regression model. It only takes one line! -We multiply the vectorized input images `x` by the weight matrix `W`, add -the bias `b`, and compute the softmax probabilities that are assigned to each -class. +We can now implement our regression model. It only takes one line! We multiply +the vectorized input images `x` by the weight matrix `W`, add the bias `b`, and +compute the softmax probabilities that are assigned to each class. ```python y = tf.nn.softmax(tf.matmul(x,W) + b) ``` -The cost function to be minimized during training can be specified just as -easily. Our cost function will be the cross-entropy between the target and the -model's prediction. +We can specify a loss function just as easily. 
Loss indicates how bad the +model's prediction was on a single example; we try to minimize that while +training across all the examples. Here, our loss function is the cross-entropy +between the target and the model's prediction: ```python cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1])) @@ -165,16 +181,14 @@ the average over these sums. ## Train the Model -Now that we have defined our model and training cost function, it is -straightforward to train using TensorFlow. -Because TensorFlow knows the entire computation graph, it -can use automatic differentiation to find the gradients of the cost with -respect to each of the variables. -TensorFlow has a variety of -[builtin optimization algorithms] -(../../../api_docs/python/train.md#optimizers). -For this example, we will use steepest gradient descent, with a step length of -0.5, to descend the cross entropy. +Now that we have defined our model and training loss function, it is +straightforward to train using TensorFlow. Because TensorFlow knows the entire +computation graph, it can use automatic differentiation to find the gradients of +the loss with respect to each of the variables. TensorFlow has a variety of +[built-in optimization algorithms] +(../../../api_docs/python/train.md#optimizers). For this example, we will use +steepest gradient descent, with a step length of 0.5, to descend the cross +entropy. ```python train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy) @@ -184,9 +198,9 @@ What TensorFlow actually did in that single line was to add new operations to the computation graph. These operations included ones to compute gradients, compute parameter update steps, and apply update steps to the parameters. -The returned operation `train_step`, when run, will apply the gradient -descent updates to the parameters. Training the model can therefore be -accomplished by repeatedly running `train_step`. 
+The returned operation `train_step`, when run, will apply the gradient descent +updates to the parameters. Training the model can therefore be accomplished by +repeatedly running `train_step`. ```python for i in range(1000): @@ -194,22 +208,21 @@ for i in range(1000): train_step.run(feed_dict={x: batch[0], y_: batch[1]}) ``` -Each training iteration we load 100 training examples. We then run the +We load 100 training examples in each training iteration. We then run the `train_step` operation, using `feed_dict` to replace the `placeholder` tensors -`x` and `y_` with the training examples. -Note that you can replace any tensor in your computation graph using `feed_dict` --- it's not restricted to just `placeholder`s. +`x` and `y_` with the training examples. Note that you can replace any tensor +in your computation graph using `feed_dict` -- it's not restricted to just +`placeholder`s. ### Evaluate the Model How well did our model do? -First we'll figure out where we predicted the correct label. `tf.argmax` -is an extremely useful function which gives you the index of the highest entry -in a tensor along some axis. For example, `tf.argmax(y,1)` is the label our -model thinks is most likely for each input, while `tf.argmax(y_,1)` is the -true label. We can use `tf.equal` to check if our prediction matches the -truth. +First we'll figure out where we predicted the correct label. `tf.argmax` is an +extremely useful function which gives you the index of the highest entry in a +tensor along some axis. For example, `tf.argmax(y,1)` is the label our model +thinks is most likely for each input, while `tf.argmax(y_,1)` is the true +label. We can use `tf.equal` to check if our prediction matches the truth. ```python correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1)) @@ -241,10 +254,11 @@ to around 99.2% accuracy -- not state of the art, but respectable. To create this model, we're going to need to create a lot of weights and biases. 
One should generally initialize weights with a small amount of noise for -symmetry breaking, and to prevent 0 gradients. Since we're using ReLU neurons, -it is also good practice to initialize them with a slightly positive initial -bias to avoid "dead neurons". Instead of doing this repeatedly while we build -the model, let's create two handy functions to do it for us. +symmetry breaking, and to prevent 0 gradients. Since we're using +[ReLU](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)) neurons, it is +also good practice to initialize them with a slightly positive initial bias to +avoid "dead neurons". Instead of doing this repeatedly while we build the model, +let's create two handy functions to do it for us. ```python def weight_variable(shape): @@ -362,13 +376,21 @@ y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2) ### Train and Evaluate the Model -How well does this model do? -To train and evaluate it we will use code that is nearly identical to that for -the simple one layer SoftMax network above. -The differences are that: we will replace the steepest gradient descent -optimizer with the more sophisticated ADAM optimizer; we will include the -additional parameter `keep_prob` in `feed_dict` to control the dropout rate; -and we will add logging to every 100th iteration in the training process. +How well does this model do? To train and evaluate it we will use code that is +nearly identical to that for the simple one layer SoftMax network above. + +The differences are that: + +- We will replace the steepest gradient descent optimizer with the more + sophisticated ADAM optimizer. + +- We will include the additional parameter `keep_prob` in `feed_dict` to control + the dropout rate. + +- We will add logging to every 100th iteration in the training process. + +Feel free to go ahead and run this code, but it does 20,000 training iterations +and may take a while (possibly up to half an hour), depending on your processor. 
```python cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv), reduction_indices=[1])) diff --git a/tensorflow/g3doc/tutorials/mnist/tf/index.md b/tensorflow/g3doc/tutorials/mnist/tf/index.md index 9d83393dc0c..c7d5eec401b 100644 --- a/tensorflow/g3doc/tutorials/mnist/tf/index.md +++ b/tensorflow/g3doc/tutorials/mnist/tf/index.md @@ -58,9 +58,6 @@ Dataset | Purpose `data_sets.validation` | 5000 images and labels, for iterative validation of training accuracy. `data_sets.test` | 10000 images and labels, for final testing of trained accuracy. -For more information about the data, please read the [Download](../../../tutorials/mnist/download/index.md) -tutorial. - ### Inputs and Placeholders The `placeholder_inputs()` function creates two [`tf.placeholder`](../../../api_docs/python/io_ops.md#placeholder) diff --git a/tensorflow/g3doc/tutorials/monitors/index.md b/tensorflow/g3doc/tutorials/monitors/index.md new file mode 100644 index 00000000000..d1edf79d8bb --- /dev/null +++ b/tensorflow/g3doc/tutorials/monitors/index.md @@ -0,0 +1,350 @@ +# Logging and Monitoring Basics with tf.contrib.learn + +When training a model, it’s often valuable to track and evaluate progress in +real time. In this tutorial, you’ll learn how to use TensorFlow’s logging +capabilities and the `Monitor` API to audit the in-progress training of a neural +network classifier for categorizing irises. This tutorial builds on the code +developed in [tf.contrib.learn Quickstart](../tflearn/index.md) +so if you haven't yet completed that tutorial, you may want to explore it first, +especially if you're looking for an intro/refresher on tf.contrib.learn basics. 
+ +## Setup {#setup} + +For this tutorial, you'll be building upon the following code from +[tf.contrib.learn Quickstart](../tflearn/index.md): + +```python +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import numpy as np + +# Data sets +IRIS_TRAINING = "iris_training.csv" +IRIS_TEST = "iris_test.csv" + +# Load datasets. +training_set = tf.contrib.learn.datasets.base.load_csv(filename=IRIS_TRAINING, + target_dtype=np.int) +test_set = tf.contrib.learn.datasets.base.load_csv(filename=IRIS_TEST, + target_dtype=np.int) + +# Build 3 layer DNN with 10, 20, 10 units respectively. +classifier = tf.contrib.learn.DNNClassifier(hidden_units=[10, 20, 10], + n_classes=3, + model_dir="/tmp/iris_model") + +# Fit model. +classifier.fit(x=training_set.data, + y=training_set.target, + steps=2000) + +# Evaluate accuracy. +accuracy_score = classifier.evaluate(x=test_set.data, + y=test_set.target)["accuracy"] +print('Accuracy: {0:f}'.format(accuracy_score)) + +# Classify two new flower samples. +new_samples = np.array( + [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float) +y = classifier.predict(new_samples) +print('Predictions: {}'.format(str(y))) +``` + +Copy the above code into a file, and download the corresponding [training] +(http://download.tensorflow.org/data/iris_training.csv) and [test] +(http://download.tensorflow.org/data/iris_test.csv) data sets to the same +directory. + +In the following sections, you'll progressively make updates to the above code +to add logging and monitoring capabilities. Final code incorporating all updates +is [available for download here](../../../examples/tutorials/monitors/iris_monitors.py). + +## Overview + +The [tf.contrib.learn Quickstart tutorial](../tflearn/index.md) +walked through how to implement a neural net classifier to categorize Iris +examples into one of three species. 
+ +But when [the code](#setup) from this tutorial is run, the output contains no +logging tracking how model training is progressing—only the results +of the `print` statements that were included: + +```none +Accuracy: 0.933333 +Predictions: [1 2] +``` + +Without any logging, model training feels like a bit of a black box; you can't +see what's happening as TensorFlow steps through gradient descent, get a sense +of whether the model is converging appropriately, or audit to determine whether +[early stopping](https://en.wikipedia.org/wiki/Early_stopping) might be +appropriate. + +One way to address this problem would be to split model training into multiple +`fit` calls with smaller numbers of steps in order to evaluate accuracy more +progressively. However, this is not recommended practice, as it greatly slows down model +training. Fortunately, tf.contrib.learn offers another solution: a [Monitor API] +(../../api_docs/python/contrib.learn.monitors.md) designed to help you log metrics +and evaluate your model while training is in progress. In the following sections, +you'll learn how to enable logging in TensorFlow, set up a ValidationMonitor to do +streaming evaluations, and visualize your metrics using TensorBoard. + +## Enabling Logging with TensorFlow + +TensorFlow uses five different levels for log messages. In order of ascending +severity, they are `DEBUG`, `INFO`, `WARN`, `ERROR`, and `FATAL`. When you +configure logging at any of these levels, TensorFlow will output all log +messages corresponding to that level and all levels of higher severity. For +example, if you set a logging level of `ERROR`, you'll get log output containing +`ERROR` and `FATAL` messages, and if you set a level of `DEBUG`, you'll get log +messages from all five levels. 
+ +By default, TensorFlow is configured at a logging level of `WARN`, but when +tracking model training, you'll want to adjust the level to `INFO`, which will +provide additional feedback as `fit` operations are in progress. + +Add the following line to the beginning of your code (right after your +`import`s): + +```python +tf.logging.set_verbosity(tf.logging.INFO) +``` + +Now when you run the code, you'll see additional log output like the following: + +```none +INFO:tensorflow:Training steps [0,200) +INFO:tensorflow:global_step/sec: 0 +INFO:tensorflow:Step 1: loss_1:0 = 1.48073 +INFO:tensorflow:training step 100, loss = 0.19847 (0.001 sec/batch). +INFO:tensorflow:Step 101: loss_1:0 = 0.192693 +INFO:tensorflow:Step 200: loss_1:0 = 0.0958682 +INFO:tensorflow:training step 200, loss = 0.09587 (0.003 sec/batch). +``` + +With `INFO`-level logging, tf.contrib.learn automatically outputs [training-loss +metrics](https://en.wikipedia.org/wiki/Loss_function) to stderr after every 100 +steps. + +## Configuring a ValidationMonitor for Streaming Evaluation + +Logging training loss is helpful to get a sense whether your model is +converging, but what if you want further insight into what's happening during +training? 
tf.contrib.learn provides several high-level `Monitor`s you can attach +to your `fit` operations to further track metrics and/or debug lower-level +TensorFlow operations during model training, including: + +| Monitor | Description | +| ------------------- | ----------------------------------------------------- | +| `CaptureVariable` | Saves a specified variable's values into a collection | +: : at every _n_ steps of training : +| `PrintTensor` | Logs a specified tensor's values at every _n_ steps | +: : of training : +| `SummarySaver` | Saves [`Summary`] | +: : (../../api_docs/python/train.md#summary-operations) : +: : [protocol buffers] : +: : (https\://developers.google.com/protocol-buffers/) : +: : for a given tensor using a [`SummaryWriter`] : +: : (../../api_docs/python/train.md#SummaryWriter) at : +: : every _n_ steps of training : +| `ValidationMonitor` | Logs a specified set of evaluation metrics at every | +: : _n_ steps of training, and, if desired, implements : +: : early stopping under certain conditions : + +### Evaluating Every *N* Steps + +For the Iris neural network classifier, while logging training loss, you might +also want to simultaneously evaluate against test data to see how well the model +is generalizing. You can accomplish this by configuring a `ValidationMonitor` +with the test data (`test_set.data` and `test_set.target`), and setting how often to evaluate +with `every_n_steps`. The default value of `every_n_steps` is `100`; here, set +`every_n_steps` to `50` to evaluate after every 50 steps of model training: + +```python +validation_monitor = tf.contrib.learn.monitors.ValidationMonitor( + test_set.data, + test_set.target, + every_n_steps=50) +``` + +Place this code right before the line instantiating the `classifier`. 
+ +`ValidationMonitor`s rely on saved checkpoints to perform evaluation operations, +so you'll want to modify instantiation of the `classifier` to add a +[`RunConfig`] +(../../api_docs/python/contrib.learn.md#RunConfig) +that includes `save_checkpoints_secs`, which specifies how many seconds should +elapse between checkpoint saves during training. Because the Iris data set is +quite small, and thus trains quickly, it makes sense to set +`save_checkpoints_secs` to 1 (saving a checkpoint every second) to ensure a +sufficient number of checkpoints: + +```python +classifier = tf.contrib.learn.DNNClassifier(hidden_units=[10, 20, 10], + n_classes=3, + model_dir="/tmp/iris_model", + config=tf.contrib.learn.RunConfig( + save_checkpoints_secs=1)) +``` + +NOTE: The `model_dir` parameter specifies an explicit directory +(`/tmp/iris_model`) for model data to be stored; this directory path will be +easier to reference later on than an autogenerated one. Each time you run the +code, any existing data in `/tmp/iris_model` will be loaded, and model training +will continue where it left off in the last run (e.g., running the script twice +in succession will execute 4000 steps during training—2000 during each +`fit` operation). To start over model training from scratch, delete +`/tmp/iris_model` before running the code. + +Finally, to attach your `validation_monitor`, update the `fit` call to include a +`monitors` param, which takes a list of all monitors to run during model +training: + +```python +classifier.fit(x=training_set.data, + y=training_set.target, + steps=2000, + monitors=[validation_monitor]) +``` + +Now, when you rerun the code, you should see validation metrics in your log +output, e.g.: + +```none +INFO:tensorflow:Validation (step 50): loss = 1.71139, global_step = 0, accuracy = 0.266667 +... +INFO:tensorflow:Validation (step 300): loss = 0.0714158, global_step = 268, accuracy = 0.966667 +... 
+INFO:tensorflow:Validation (step 1750): loss = 0.0574449, global_step = 1729, accuracy = 0.966667
+```
+
+### Customizing the Evaluation Metrics
+
+By default, if no evaluation metrics are specified, `ValidationMonitor` will log
+both [loss](https://en.wikipedia.org/wiki/Loss_function) and accuracy, but you
+can customize the list of metrics that will be run every 50 steps. The
+[tf.contrib.metrics module](../../api_docs/python/contrib.metrics.md) provides
+a variety of additional metric functions for classification models that you can
+use out of the box with `ValidationMonitor`, including
+`streaming_precision` and `streaming_recall`. To specify the exact metrics you'd
+like to run in each evaluation pass, add a `metrics` param to the
+`ValidationMonitor` constructor. `metrics` takes a dict of key/value pairs,
+where each key is the name you'd like logged for the metric, and the
+corresponding value is the function that calculates it.
+
+Revise the `ValidationMonitor` constructor as follows to add logging for
+precision and recall, in addition to accuracy (loss is always logged, and
+doesn't need to be explicitly specified):
+
+```python
+validation_metrics = {"accuracy": tf.contrib.metrics.streaming_accuracy,
+                      "precision": tf.contrib.metrics.streaming_precision,
+                      "recall": tf.contrib.metrics.streaming_recall}
+validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
+    test_set.data,
+    test_set.target,
+    every_n_steps=50,
+    metrics=validation_metrics)
+```
+
+Rerun the code again, and you should see precision and recall included in your
+log output, e.g.:
+
+```none
+INFO:tensorflow:Validation (step 50): recall = 0.0, accuracy = 0.266667, global_step = 0, precision = 0.0, loss = 1.71139
+...
+INFO:tensorflow:Validation (step 150): recall = 1.0, accuracy = 0.966667, global_step = 132, precision = 1.0, loss = 0.157797
+...
+INFO:tensorflow:Validation (step 1600): recall = 1.0, accuracy = 0.966667, global_step = 1589, precision = 1.0, loss = 0.055873 +``` + +### Early Stopping with ValidationMonitor + +Note that in the above log output, by step 150, the model has already achieved +precision and recall rates of 1.0. This raises the question as to whether model +training could benefit from [early stopping] +(https://en.wikipedia.org/wiki/Early_stopping). + +In addition to logging eval metrics, `ValidationMonitor`s make it easy to +implement early stopping when specified conditions are met, via three params: + +| Param | Description | +| -------------------------------- | ----------------------------------------- | +| `early_stopping_metric` | Metric that triggers early stopping | +: : (e.g., loss or accuracy) under conditions : +: : specified in `early_stopping_rounds` and : +: : `early_stopping_metric_minimize`. Default : +: : is `"loss"`. : +| `early_stopping_metric_minimize` | `True` if desired model behavior is to | +: : minimize the value of : +: : `early_stopping_metric`; `False` if : +: : desired model behavior is to maximize the : +: : value of `early_stopping_metric`. Default : +: : is `True`. : +| `early_stopping_rounds` | Sets a number of steps during which if | +: : the `early_stopping_metric` does not : +: : decrease (if : +: : `early_stopping_metric_minimize` is : +: : `True`) or increase (if : +: : `early_stopping_metric_minimize` is : +: : False), training will be stopped. Default : +: : is `None`, which means early stopping : +: : will never occur. 
: + +The following revision to the `ValidationMonitor` constructor specifies that if +loss (`early_stopping_metric="loss"`) does not decrease +(`early_stopping_metric_minimize=True`) over a period of 200 steps +(`early_stopping_rounds=200`), model training will stop immediately at that +point, and not complete the full 2000 steps specified in `fit`: + +```python +validation_monitor = tf.contrib.learn.monitors.ValidationMonitor( + test_set.data, + test_set.target, + every_n_steps=50, + metrics=validation_metrics, + early_stopping_metric="loss", + early_stopping_metric_minimize=True, + early_stopping_rounds=200) +``` + +Rerun the code to see if model training stops early: + +```none +... +INFO:tensorflow:Validation (step 1450): recall = 1.0, accuracy = 0.966667, global_step = 1431, precision = 1.0, loss = 0.0550445 +INFO:tensorflow:Stopping. Best step: 1150 with loss = 0.0506100878119. +``` + +Indeed, here training stops at step 1450, indicating that for the past 200 +steps, loss did not decrease, and that overall, step 1150 produced the smallest +loss value against the test data set. This suggests that additional calibration +of hyperparameters by decreasing the step count might further improve the model. + +## Visualizing Log Data with TensorBoard + +Reading through the log produced by `ValidationMonitor` provides plenty of raw +data on model performance during training, but it may also be helpful to see +visualizations of this data to get further insight into trends—for +example, how accuracy is changing over step count. You can use TensorBoard (a +separate program packaged with TensorFlow) to plot graphs like this by setting +the `logdir` command-line argument to the directory where you saved your model +training data (here, `/tmp/iris_model`). Run the following on your command line: + +
$ tensorboard --logdir=/tmp/iris_model/
+Starting TensorBoard 22 on port 6006
+(You can navigate to http://0.0.0.0:6006)
+ +Then load the provided URL (here, `http://0.0.0.0:6006`) in your browser. If you +click on the accuracy field, you'll see an image like the following, which shows +accuracy plotted against step count: + +![Accuracy over step count in TensorBoard] +(../../images/validation_monitor_tensorboard_accuracy.png "Accuracy over step count in TensorBoard") + +For more on using TensorBoard, see [TensorBoard: Visualizing Learning] +(../../how_tos/summaries_and_tensorboard/index.md) +and [TensorBoard: Graph Visualization](../../how_tos/graph_viz/index.md). diff --git a/tensorflow/g3doc/tutorials/recurrent/index.md b/tensorflow/g3doc/tutorials/recurrent/index.md index 52155633329..82b159c20ab 100644 --- a/tensorflow/g3doc/tutorials/recurrent/index.md +++ b/tensorflow/g3doc/tutorials/recurrent/index.md @@ -155,8 +155,9 @@ the second and so on. We have a class called `MultiRNNCell` that makes the implementation seamless: ```python -lstm = rnn_cell.BasicLSTMCell(lstm_size) -stacked_lstm = rnn_cell.MultiRNNCell([lstm] * number_of_layers) +lstm = rnn_cell.BasicLSTMCell(lstm_size, state_is_tuple=False) +stacked_lstm = rnn_cell.MultiRNNCell([lstm] * number_of_layers, + state_is_tuple=False) initial_state = state = stacked_lstm.zero_state(batch_size, tf.float32) for i in range(num_steps): diff --git a/tensorflow/g3doc/tutorials/tflearn/index.md b/tensorflow/g3doc/tutorials/tflearn/index.md index 88f7da506ab..4b03801fe8a 100644 --- a/tensorflow/g3doc/tutorials/tflearn/index.md +++ b/tensorflow/g3doc/tutorials/tflearn/index.md @@ -25,6 +25,10 @@ started with this tutorial. Here is the full code for our neural network: ```python +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + import tensorflow as tf import numpy as np @@ -33,40 +37,56 @@ IRIS_TRAINING = "iris_training.csv" IRIS_TEST = "iris_test.csv" # Load datasets. 
-training_set = tf.contrib.learn.datasets.base.load_csv(filename=IRIS_TRAINING, target_dtype=np.int) -test_set = tf.contrib.learn.datasets.base.load_csv(filename=IRIS_TEST, target_dtype=np.int) - -x_train, x_test, y_train, y_test = training_set.data, test_set.data, \ - training_set.target, test_set.target +training_set = tf.contrib.learn.datasets.base.load_csv(filename=IRIS_TRAINING, + target_dtype=np.int) +test_set = tf.contrib.learn.datasets.base.load_csv(filename=IRIS_TEST, + target_dtype=np.int) # Build 3 layer DNN with 10, 20, 10 units respectively. -classifier = tf.contrib.learn.DNNClassifier(hidden_units=[10, 20, 10], n_classes=3) +classifier = tf.contrib.learn.DNNClassifier(hidden_units=[10, 20, 10], + n_classes=3, + model_dir="/tmp/iris_model") # Fit model. -classifier.fit(x=x_train, y=y_train, steps=200) +classifier.fit(x=training_set.data, + y=training_set.target, + steps=2000) # Evaluate accuracy. -accuracy_score = classifier.evaluate(x=x_test, y=y_test)["accuracy"] +accuracy_score = classifier.evaluate(x=test_set.data, + y=test_set.target)["accuracy"] print('Accuracy: {0:f}'.format(accuracy_score)) # Classify two new flower samples. new_samples = np.array( [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float) y = classifier.predict(new_samples) -print ('Predictions: {}'.format(str(y))) +print('Predictions: {}'.format(str(y))) ``` The following sections walk through the code in detail. ## Load the Iris CSV data to TensorFlow -The [Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set) -contains 150 rows of data, comprising 50 samples from each of three related -Iris species: *Iris setosa*, *Iris virginica*, and *Iris versicolor*. Each row -contains the following data for each flower sample: [sepal](https://en.wikipedia.org/wiki/Sepal) -length, sepal width, [petal](https://en.wikipedia.org/wiki/Petal) length, petal width, -and flower species. 
Flower species are represented as integers, with 0 denoting *Iris -setosa*, 1 denoting *Iris versicolor*, and 2 denoting *Iris virginica*. +The [Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set) contains +150 rows of data, comprising 50 samples from each of three related Iris species: +*Iris setosa*, *Iris virginica*, and *Iris versicolor*. + +![Petal geometry compared for three iris species: Iris setosa, Iris virginica, +and Iris versicolor](../../images/iris_three_species.jpg) **From left to right, +[*Iris setosa*](https://commons.wikimedia.org/w/index.php?curid=170298) (by +[Radomil](https://commons.wikimedia.org/wiki/User:Radomil), CC BY-SA 3.0), +[*Iris versicolor*](https://commons.wikimedia.org/w/index.php?curid=248095) (by +[Dlanglois](https://commons.wikimedia.org/wiki/User:Dlanglois), CC BY-SA 3.0), +and [*Iris virginica*](https://www.flickr.com/photos/33397993@N05/3352169862) +(by [Frank Mayfield](https://www.flickr.com/photos/33397993@N05), CC BY-SA +2.0).** + +Each row contains the following data for each flower sample: [sepal] +(https://en.wikipedia.org/wiki/Sepal) length, sepal width, [petal] +(https://en.wikipedia.org/wiki/Petal) length, petal width, and flower species. +Flower species are represented as integers, with 0 denoting *Iris setosa*, 1 +denoting *Iris versicolor*, and 2 denoting *Iris virginica*. Sepal Length | Sepal Width | Petal Length | Petal Width | Species :----------- | :---------- | :----------- | :---------- | :------ @@ -82,17 +102,19 @@ Sepal Length | Sepal Width | Petal Length | Petal Width | Species 6.2 | 3.4 | 5.4 | 2.3 | 2 5.9 | 3.0 | 5.1 | 1.8 | 2 - For this -tutorial, the Iris data has been randomized and split into two separate CSVs: -a training set of 120 samples -([iris_training.csv](http://download.tensorflow.org/data/iris_training.csv)). -and a test set of 30 samples -([iris_test.csv](http://download.tensorflow.org/data/iris_test.csv)). 
+For this tutorial, the Iris data has been randomized and split into two separate +CSVs: a training set of 120 samples ([iris_training.csv] +(http://download.tensorflow.org/data/iris_training.csv)). and a test set of 30 +samples ([iris_test.csv](http://download.tensorflow.org/data/iris_test.csv)). +Place these files in the same directory as your Python code. To get started, first import TensorFlow and numpy: ```python +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + import tensorflow as tf import numpy as np ``` @@ -114,94 +136,96 @@ IRIS_TRAINING = "iris_training.csv" IRIS_TEST = "iris_test.csv" # Load datasets. -training_set = tf.contrib.learn.datasets.base.load_csv(filename=IRIS_TRAINING, target_dtype=np.int) -test_set = tf.contrib.learn.datasets.base.load_csv(filename=IRIS_TEST, target_dtype=np.int) +training_set = tf.contrib.learn.datasets.base.load_csv(filename=IRIS_TRAINING, + target_dtype=np.int) +test_set = tf.contrib.learn.datasets.base.load_csv(filename=IRIS_TEST, + target_dtype=np.int) ``` -Next, assign variables to the feature data and target values: `x_train` for -training-set feature data, `x_test` for test-set feature data, `y_train` for -training-set target values, and `y_test` for test-set target values. `Dataset`s -in tf.contrib.learn are [named tuples](https://docs.python.org/2/library/collections.h -tml#collections.namedtuple), and you can access feature data and target values -via the `data` and `target` fields, respectively: +`Dataset`s in tf.contrib.learn are [named tuples] +(https://docs.python.org/2/library/collections.html#collections.namedtuple), +and you can access feature data and target values via the `data` and +`target` fields. Here, `training_set.data` and `training_set.target` contain +the feature data and target values for the training set, respectively, and +`test_set.data` and `test_set.target` contain feature data +and target values for the test set. 
-```python -x_train, x_test, y_train, y_test = training_set.data, test_set.data, \ - training_set.target, test_set.target -``` - -Later on, in "Fit the DNNClassifier to the Iris Training Data," you'll use -`x_train` and `y_train` to train your model, and in "Evaluate Model -Accuracy", you'll use `x_test` and `y_test`. But first, you'll construct your +Later on, in ["Fit the DNNClassifier to the Iris Training Data,"] +(#fit-dnnclassifier) you'll use `training_set.data` and `training_set.target` +to train your model, and in ["Evaluate Model Accuracy,"](#evaluate-accuracy) +you'll use `test_set.data` and `test_set.target`. But first, you'll construct your model in the next section. ## Construct a Deep Neural Network Classifier -tf.contrib.learn offers a variety of predefined models, called [`Estimator`s -](../../api_docs/python/contrib.learn.html#estimators), which you can use "out -of the box" to run training and evaluation operations on your data. Here, -you'll configure a Deep Neural Network Classifier model to fit the Iris data. -Using tf.contrib.learn, you can instantiate your -[`DNNClassifier`](../../api_docs/python/contrib.learn.html#DNNClassifier) with -just one line of code: +tf.contrib.learn offers a variety of predefined models, called [`Estimator`s] +(../../api_docs/python/contrib.learn.html#estimators), which you can use "out of +the box" to run training and evaluation operations on your data. Here, you'll +configure a Deep Neural Network Classifier model to fit the Iris data. Using +tf.contrib.learn, you can instantiate your [`DNNClassifier`] +(../../api_docs/python/contrib.learn.html#DNNClassifier) with just one line of +code: ```python -# Build 3 layer DNN with 10, 20, 10 units respectively. -classifier = tf.contrib.learn.DNNClassifier(hidden_units=[10, 20, 10], n_classes=3) +# Build 3 layer DNN with 10, 20, 10 units respectively. 
+classifier = tf.contrib.learn.DNNClassifier(hidden_units=[10, 20, 10], + n_classes=3, + model_dir="/tmp/iris_model") ``` -The code above creates a `DNNClassifier` model with three [hidden layers](http://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw), -containing 10, 20, and 10 neurons, respectively (`hidden_units=[10, 20, 10]`), and three target -classes (`n_classes=3`). +The code above creates a `DNNClassifier` model with three [hidden layers] +(http://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw), +containing 10, 20, and 10 neurons, respectively (`hidden_units=[10, 20, 10]`), +and three target classes (`n_classes=3`). Model data will be stored in +`/tmp/iris_model`. +## Fit the DNNClassifier to the Iris Training Data {#fit-dnnclassifier} -## Fit the DNNClassifier to the Iris Training Data - -Now that you've configured your DNN `classifier` model, you can fit it to the Iris training data -using the [`fit`](../../api_docs/python/contrib.learn.html#BaseEstimator.fit) -method. Pass as arguments your feature data (`x_train`), target values -(`y_train`), and the number of steps to train (here, 200): +Now that you've configured your DNN `classifier` model, you can fit it to the +Iris training data using the [`fit`] +(../../api_docs/python/contrib.learn.html#BaseEstimator.fit) method. Pass as +arguments your feature data (`training_set.data`), target values (`training_set.target`), and the +number of steps to train (here, 2000): ```python # Fit model -classifier.fit(x=x_train, y=y_train, steps=200) +classifier.fit(x=training_set.data, y=training_set.target, steps=2000) ``` - - - The state of the model is preserved in the `classifier`, which means you can train iteratively if you like. 
For example, the above is equivalent to the following: ```python -classifier.fit(x=x_train, y=y_train, steps=100) -classifier.fit(x=x_train, y=y_train, steps=100) +classifier.fit(x=training_set.data, y=training_set.target, steps=1000) +classifier.fit(x=training_set.data, y=training_set.target, steps=1000) ``` - However, if you're looking to track the model while it trains, you'll likely -want to instead use a TensorFlow [`monitor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/monitors.py) -to perform logging operations. +want to instead use a TensorFlow [`monitor`] +(https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/monitors.py) +to perform logging operations. See the tutorial [“Logging and Monitoring +Basics with tf.contrib.learn”](../monitors/index.md) for more on this +topic. -## Evaluate Model Accuracy +## Evaluate Model Accuracy {#evaluate-accuracy} You've fit your `DNNClassifier` model on the Iris training data; now, you can check its accuracy on the Iris test data using the [`evaluate` ](../../api_docs/python/contrib.learn.html#BaseEstimator.evaluate) method. Like `fit`, `evaluate` takes feature data and target values as arguments, and returns a `dict` with the evaluation results. The following -code passes the Iris test data—`x_test` and `y_test`—to `evaluate` +code passes the Iris test data—`test_set.data` and `test_set.target`—to `evaluate` and prints the `accuracy` from the results: ```python -accuracy_score = classifier.evaluate(x=x_test, y=y_test)["accuracy"] +accuracy_score = classifier.evaluate(x=test_set.data, y=test_set.target)["accuracy"] print('Accuracy: {0:f}'.format(accuracy_score)) ``` Run the full script, and check the accuracy results. You should get: ``` -Accuracy: 0.933333 +Accuracy: 0.966667 ``` Not bad for a relatively small data set! 
@@ -214,7 +238,7 @@ say you have these two new flower samples: Sepal Length | Sepal Width | Petal Length | Petal Width :----------- | :---------- | :----------- | :---------- 6.4 | 3.2 | 4.5 | 1.5 -5.8 | 3.1 | 5.0 | 1.7 +5.8 | 3.1 | 5.0 | 1.7 You can predict their species with the following code: @@ -223,7 +247,7 @@ You can predict their species with the following code: new_samples = np.array( [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=float) y = classifier.predict(new_samples) -print ('Predictions: {}'.format(str(y))) +print('Predictions: {}'.format(str(y))) ``` The `predict()` method returns an array of predictions, one for each sample: diff --git a/tensorflow/models/rnn/ptb/ptb_word_lm.py b/tensorflow/models/rnn/ptb/ptb_word_lm.py index 5fea073820a..a8b54a3e9f3 100644 --- a/tensorflow/models/rnn/ptb/ptb_word_lm.py +++ b/tensorflow/models/rnn/ptb/ptb_word_lm.py @@ -148,11 +148,15 @@ class PTBModel(object): tvars = tf.trainable_variables() grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), config.max_grad_norm) - optimizer = tf.train.GradientDescentOptimizer(self.lr) + optimizer = tf.train.GradientDescentOptimizer(self._lr) self._train_op = optimizer.apply_gradients(zip(grads, tvars)) + self._new_lr = tf.placeholder( + tf.float32, shape=[], name="new_learning_rate") + self._lr_update = tf.assign(self._lr, self._new_lr) + def assign_lr(self, session, lr_value): - session.run(tf.assign(self.lr, lr_value)) + session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) @property def input_data(self): diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index dcc6052a3cb..5e2621cca8f 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -157,10 +157,8 @@ py_test( cc_library( name = "python_op_gen", - srcs = [ - "framework/python_op_gen.cc", - "framework/python_op_gen.h", - ], + srcs = ["framework/python_op_gen.cc"], + hdrs = ["framework/python_op_gen.h"], visibility = ["//visibility:public"], deps = [ 
"//tensorflow/core:framework", @@ -1059,6 +1057,27 @@ cuda_py_tests( ], ) +py_library( + name = "net_lib", + testonly = 1, + srcs = ["util/net_lib.py"], + srcs_version = "PY2AND3", + deps = [ + ":pywrap_tensorflow", + ], +) + +py_tests( + name = "net_lib_test", + size = "small", + srcs = [ + "util/net_lib_test.py", + ], + additional_deps = [ + ":net_lib", + ], +) + tf_cuda_library( name = "tf_session_helper", srcs = ["client/tf_session_helper.cc"], @@ -1067,6 +1086,8 @@ tf_cuda_library( ":construction_fails_op", ":numpy_lib", ":test_ops_kernels", + "//tensorflow/c:c_api", + "//tensorflow/c:tf_status_helper", "//tensorflow/core", "//tensorflow/core:all_kernels", "//tensorflow/core:direct_session", @@ -1083,6 +1104,7 @@ tf_py_wrap_cc( swig_includes = [ "client/device_lib.i", "client/events_writer.i", + "client/net_lib.i", "client/quantize_training.i", "client/tf_session.i", "framework/python_op_gen.i", @@ -1106,6 +1128,9 @@ tf_py_wrap_cc( ":tf_session_helper", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", "//tensorflow/core/distributed_runtime/rpc:grpc_session", + "//tensorflow/c:c_api", + "//tensorflow/c:checkpoint_reader", + "//tensorflow/c:tf_status_helper", "//tensorflow/core:lib", "//tensorflow/core/distributed_runtime:server_lib", "//util/python:python_headers", @@ -1145,6 +1170,14 @@ py_test( ], ) +cuda_py_test( + name = "localhost_cluster_performance_test", + size = "medium", + srcs = [ + "training/localhost_cluster_performance_test.py", + ], +) + py_library( name = "timeline", srcs = ["client/timeline.py"], @@ -1179,6 +1212,18 @@ py_test( ], ) +py_test( + name = "session_debug_test", + size = "small", + srcs = ["debug/session_debug_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":framework", + ":framework_test_lib", + ":session", + ], +) + cuda_py_test( name = "timeline_test", size = "small", @@ -1231,7 +1276,6 @@ cuda_py_test( name = "special_math_ops_test", size = "small", srcs = ["ops/special_math_ops_test.py"], - tags = ["notsan"], ) 
cuda_py_tests( @@ -1254,7 +1298,6 @@ cuda_py_tests( "//tensorflow/core:image_testdata", ], shard_count = 5, - tags = ["notsan"], ) cuda_py_tests( @@ -1267,7 +1310,8 @@ cuda_py_tests( "training/server_lib_test.py", "training/session_manager_test.py", "training/supervisor_test.py", - "training/saver_test.py", + "training/saver_large_variable_test.py", + "training/localhost_cluster_performance_test.py", ], ), additional_deps = [ @@ -1275,14 +1319,15 @@ cuda_py_tests( ], ) -cuda_py_test( - name = "saver_test", +py_test( + name = "saver_large_variable_test", size = "small", - srcs = ["training/saver_test.py"], - additional_deps = [ - ":training", - ], + srcs = ["training/saver_large_variable_test.py"], + srcs_version = "PY2AND3", tags = ["notsan"], # http://b/30379628 + deps = [ + "//tensorflow:tensorflow_py", + ], ) cuda_py_test( @@ -1313,6 +1358,7 @@ py_tests( ["training/input_test.py"], ), additional_deps = [ + "//tensorflow:tensorflow_py", ":training", ], ) diff --git a/tensorflow/python/client/net_lib.i b/tensorflow/python/client/net_lib.i new file mode 100644 index 00000000000..333e2abbc59 --- /dev/null +++ b/tensorflow/python/client/net_lib.i @@ -0,0 +1,30 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +%include "tensorflow/python/platform/base.i" + +%{ +#include "tensorflow/core/platform/net.h" +%} + +%ignoreall + +%unignore tensorflow; +%unignore tensorflow::internal; +%unignore tensorflow::internal::PickUnusedPortOrDie; + +%include "tensorflow/core/platform/net.h" + +%unignoreall diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py index dbcbb25469a..0c2edcb2279 100644 --- a/tensorflow/python/client/session.py +++ b/tensorflow/python/client/session.py @@ -23,7 +23,6 @@ import re import threading import numpy as np -import six from tensorflow.core.protobuf import config_pb2 from tensorflow.python import pywrap_tensorflow as tf_session @@ -71,30 +70,375 @@ def _get_feeds_for_indexed_slices(feed, feed_val): [feed.values, feed.indices, feed.dense_shape], feed_val)) -def _unflatten_fetches(fetches, flat_values): - """Creates a dictionary mapping fetched keys to values. +# List of extensions supported to convert run arguments into actual fetches and +# feeds. +# +# Each element in the list is a tuple of (Type, fetch_fn, feed_fn1, feed_fn2), +# where the function signatures are: +# fetch_fn : Type -> (list of Tensors, +# lambda: list of fetched np.ndarray -> TypeVal) +# feed_fn1 : Type, TypeVal -> list of (Tensor, value) +# feed_fn2 : Type -> list of Tensors +# +# `fetch_fn` describes how to expand fetch into its +# component Tensors and how to contract the fetched results back into +# a single return value. +# +# Each feed function describes how to unpack a single fed value and map it to +# feeds of one or more tensors and their corresponding values: `feed_fn1` is +# used to feed a run, `feed_fn2` to set up a partial run. +# +# TODO(touts): We could reimplement these as specialized _FeedMapper +# implementations after we refactor the feed handling code to use them. 
+# +# Eventually, this registration could be opened up to support custom Tensor +# expansions. +# pylint: disable=g-long-lambda +_REGISTERED_EXPANSIONS = [ + # SparseTensors are fetched as SparseTensorValues. They can be fed + # SparseTensorValues or normal tuples. + (ops.SparseTensor, + lambda fetch: ( + [fetch.indices, fetch.values, fetch.shape], + lambda fetched_vals: ops.SparseTensorValue(*fetched_vals)), + lambda feed, feed_val: list(zip( + [feed.indices, feed.values, feed.shape], feed_val)), + lambda feed: [feed.indices, feed.values, feed.shape]), + # IndexedSlices are fetched as IndexedSlicesValues. They can be fed + # IndexedSlicesValues or normal tuples. + (ops.IndexedSlices, + lambda fetch: ( + [fetch.values, fetch.indices] if fetch.dense_shape is None + else [fetch.values, fetch.indices, fetch.dense_shape], + _get_indexed_slices_value_from_fetches), + _get_feeds_for_indexed_slices, + lambda feed: [feed.values, feed.indices] if feed.dense_shape is None + else [feed.values, feed.indices, feed.dense_shape]), + # The default catches all other types and performs no expansions. + (object, + lambda fetch: ([fetch], lambda fetched_vals: fetched_vals[0]), + lambda feed, feed_val: [(feed, feed_val)], + lambda feed: [feed])] +# pylint: enable=g-long-lambda + + +class _FetchMapper(object): + """Definition of the interface provided by fetch mappers. + + Fetch mappers are utility classes used by the _FetchHandler to handle + arbitrary structures for the `fetch` argument to `Session.run()`. + + The `fetch` argument can be of various shapes: single tensor or op, list of + fetches, tuple of fetches, namedtuple of fetches, or dict of fetches. The + structures can be arbitrarily nested. + + The low level run() API only wants a list of tensor or op names. The various + `_FetchMapper` subclasses below take care of handling the different shapes: + uniquifying the fetches, and constructing results with the original shape. 
+ """ + + def unique_fetches(self): + """Return the list of unique tensors or ops needed by this fetch mapper. + + Returns: + A list of tensors or ops. + """ + raise NotImplementedError('Must be implemented by subclasses') + + def build_results(self, values): + """Build results that match the original shape of the fetch. + + Args: + values: List of values returned by run(). The values correspond + exactly to the list tensors or ops returned by unique_fetches(). + + Returns: + A struct of the same shape as the original fetch object handled by + this fetch mapper. In the returned struct, the original fetches are + replaced by their fetched values. + """ + raise NotImplementedError('Must be implemented by subclasses') + + @staticmethod + def for_fetch(fetch): + """Creates fetch mapper that handles the structure of `fetch`. + + The default graph must be the one from which we want to fetch values when + this function is called. + + Args: + fetch: An arbitrary fetch structure: singleton, list, tuple, + namedtuple, or dict. + + Returns: + An instance of a subclass of `_FetchMapper` that handles the shape. + """ + if fetch is None: + raise TypeError('Fetch argument %r has invalid type %r' % + (fetch, type(fetch))) + elif isinstance(fetch, (list, tuple)): + # NOTE(touts): This is also the code path for namedtuples. + return _ListFetchMapper(fetch) + elif isinstance(fetch, dict): + return _DictFetchMapper(fetch) + else: + # Look for a handler in the registered expansions. + for tensor_type, fetch_fn, _, _ in _REGISTERED_EXPANSIONS: + if isinstance(fetch, tensor_type): + fetches, contraction_fn = fetch_fn(fetch) + return _ElementFetchMapper(fetches, contraction_fn) + # Did not find anything. + raise TypeError('Fetch argument %r has invalid type %r' % + (fetch, type(fetch))) + + +class _ElementFetchMapper(_FetchMapper): + """Fetch mapper for singleton tensors and ops.""" + + def __init__(self, fetches, contraction_fn): + """Creates an _ElementFetchMapper. 
+ + This is the fetch mapper used for leaves in the fetch struct. Because of + the expansions mechanism, a leaf can actually fetch more than one tensor. + + Also note that the fetches here can be just strings (tensor or op names) or + any other object that the graph knows how to convert to a tensor, such as a + Variable. So we have to run each fetch through `as_graph_element()` to get + the corresponding tensor or op. + + Args: + fetches: List of objects, as returned by a fetch_fn defined + in _REGISTERED_EXPANSIONS. + contraction_fn: Callable as returned by a fetch_fn. + """ + self._unique_fetches = [] + for fetch in fetches: + try: + self._unique_fetches.append(ops.get_default_graph().as_graph_element( + fetch, allow_tensor=True, allow_operation=True)) + except TypeError as e: + raise TypeError('Fetch argument %r has invalid type %r, ' + 'must be a string or Tensor. (%s)' + % (fetch, type(fetch), str(e))) + except ValueError as e: + raise ValueError('Fetch argument %r cannot be interpreted as a ' + 'Tensor. (%s)' % (fetch, str(e))) + except KeyError as e: + raise ValueError('Fetch argument %r cannot be interpreted as a ' + 'Tensor. (%s)' % (fetch, str(e))) + self._contraction_fn = contraction_fn + + def unique_fetches(self): + return self._unique_fetches + + def build_results(self, values): + if not values: + # 'Operation' case + return None + else: + return self._contraction_fn(values) + + +def _uniquify_fetches(fetch_mappers): + """Uniquifies fetches from a list of fetch_mappers. + + This is a utility function used by _ListFetchMapper and _DictFetchMapper. It + gathers all the unique fetches from a list of mappers and builds a list + containing all of them but without duplicates (unique_fetches). + + It also returns a 2-D list of integers (values_indices) indicating at which + index in unique_fetches the fetches of the mappers are located. 
+ + This list is as follows: + values_indices[mapper_index][mapper_fetch_index] = unique_fetches_index Args: - fetches: A dictionary of either graph elements or lists/tuples - of graph elements. - flat_values: A flat list of fetched values. + fetch_mappers: list of fetch mappers. Returns: - A dictionary with the same keys as `fetches`, mapping to the fetched value - (or list of values) in `flat_values`. + A list of fetches. + A 2-D list of integers. """ - used = 0 - ret = {} - for key, fetch in six.iteritems(fetches): - if isinstance(fetch, (list, tuple)): - flat_fetch_values = nest.flatten(fetch) - start, used = used, used + len(flat_fetch_values) - ret[key] = nest.pack_sequence_as(fetch, flat_values[start:used]) + unique_fetches = [] + value_indices = [] + seen_fetches = {} + for m in fetch_mappers: + m_value_indices = [] + for f in m.unique_fetches(): + j = seen_fetches.get(f) + if j is None: + j = len(seen_fetches) + seen_fetches[f] = j + unique_fetches.append(f) + m_value_indices.append(j) + value_indices.append(m_value_indices) + return unique_fetches, value_indices + + +class _ListFetchMapper(_FetchMapper): + """Fetch mapper for lists, tuples, and namedtuples.""" + + def __init__(self, fetches): + """Creates a _ListFetchMapper. + + Args: + fetches: List, tuple, or namedtuple of fetches. + """ + self._fetch_type = type(fetches) + self._mappers = [_FetchMapper.for_fetch(fetch) for fetch in fetches] + self._unique_fetches, self._value_indices = _uniquify_fetches(self._mappers) + + def unique_fetches(self): + return self._unique_fetches + + def build_results(self, values): + # Create the list of results for each mapper. + results = [] + for m, vi in zip(self._mappers, self._value_indices): + results.append(m.build_results([values[j] for j in vi])) + # Return a value of the original type of the fetches. 
+ if self._fetch_type == list: + return results + elif self._fetch_type == tuple: + return tuple(results) else: - ret[key] = flat_values[used] - used += 1 - assert used == len(flat_values) - return ret + # This is the code path for namedtuple. + return self._fetch_type(*results) + + +class _DictFetchMapper(_FetchMapper): + """Fetch mapper for dicts.""" + + def __init__(self, fetches): + """Creates a _DictFetchMapper. + + Args: + fetches: Dict of fetches. + """ + self._keys = fetches.keys() + self._mappers = [_FetchMapper.for_fetch(fetch) + for fetch in fetches.values()] + self._unique_fetches, self._value_indices = _uniquify_fetches(self._mappers) + + def unique_fetches(self): + return self._unique_fetches + + def build_results(self, values): + results = {} + for k, m, vi in zip(self._keys, self._mappers, self._value_indices): + results[k] = m.build_results([values[j] for j in vi]) + return results + + +class _FetchHandler(object): + """Handler for structured fetches. + + Given a graph, a user-provided structure for fetches, and a feed dict, this + class takes care of generating a list of tensor names to fetch and op names + to run for a low level `run()` call. + + Given the results of the low level run call, this class can also rebuild a + result structure matching the user-provided structure for fetches, but + containing the corresponding results. + """ + # TODO(touts): Make this class also take care of destructuring the feed + # dict instead of doing it in the callers. + + def __init__(self, graph, fetches, feeds): + """Creates a fetch handler. + + Args: + graph: Graph of the fetches. Used to check for fetchability + and to convert all fetches to tensors or ops as needed. + fetches: An arbitrary fetch structure: singleton, list, tuple, + namedtuple, or dict. + feeds: A feed dict where keys are fully resolved tensor names. 
+ """ + with graph.as_default(): + self._fetch_mapper = _FetchMapper.for_fetch(fetches) + self._fetches = [] + self._targets = [] + self._feeds = feeds + self._ops = [] + self._fetch_handles = {} + for fetch in self._fetch_mapper.unique_fetches(): + fetch_name = compat.as_bytes(fetch.name) + if isinstance(fetch, ops.Operation): + self._assert_fetchable(graph, fetch) + self._targets.append(fetch_name) + self._ops.append(True) + else: + self._assert_fetchable(graph, fetch.op) + self._fetches.append(fetch_name) + self._ops.append(False) + # Remember the fetch if it is for a tensor handle. + if isinstance(fetch, ops.Tensor) and fetch.op.type == 'GetSessionHandle': + self._fetch_handles[fetch_name] = fetch.op.inputs[0].dtype + self._final_fetches = [x for x in self._fetches if x not in feeds] + + def _assert_fetchable(self, graph, op): + if not graph.is_fetchable(op): + raise ValueError( + 'Operation %r has been marked as not fetchable.' % op.name) + + def fetches(self): + """Return the unique names of tensors to fetch. + + Returns: + A list of strings. + """ + return self._final_fetches + + def targets(self): + """Return the unique names of ops to run. + + Returns: + A list of strings. + """ + return self._targets + + def build_results(self, session, tensor_values): + """Build results matching the original fetch shape. + + `tensor_values` must be a list of the same length as + the one returned by `fetches()`, and holding the requested + fetch values. + + This method builds a struct with the same shape as the original `fetches` + passed to the constructor, in which the fetches are replaced by their + fetched value. + + Args: + session: The enclosing session. Used for tensor handles. + tensor_values: List of values matching the list returned + by fetches(). + + Returns: + A structure of the same shape as the original `fetches` argument but + containing tensors or None (for fetched ops). 
+ """ + full_values = [] + assert len(self._final_fetches) == len(tensor_values) + i = 0 + j = 0 + for is_op in self._ops: + if is_op: + full_values.append(None) + else: + # If the fetch was in the feeds, use the fed value, otherwise + # use the returned value. + value = self._feeds.get(self._fetches[i]) + if value is None: + value = tensor_values[j] + j += 1 + dtype = self._fetch_handles.get(self._fetches[i]) + if dtype: + full_values.append(session_ops.TensorHandle(value, dtype, session)) + else: + full_values.append(value) + i += 1 + assert j == len(tensor_values) + return self._fetch_mapper.build_results(full_values) class BaseSession(SessionInterface): @@ -252,72 +596,56 @@ class BaseSession(SessionInterface): """ return ops.default_session(self) - # Eventually, this registration could be opened up to support custom - # Tensor expansions. Expects tuples of (Type, fetch_fn, feed_fn1, feed_fn2), - # where the signatures are: - # fetch_fn : Type -> (list of Tensors, - # lambda: list of fetched np.ndarray -> TypeVal) - # feed_fn1 : Type, TypeVal -> list of (Tensor, value) - # feed_fn2 : Type -> list of Tensors - # Conceptually, fetch_fn describes how to expand fetch into its - # component Tensors and how to contracting the fetched results back into - # a single return value. feed_fn describes how to unpack a single fed - # value and map it to feeds of a Tensor and its corresponding value. - # pylint: disable=g-long-lambda - _REGISTERED_EXPANSIONS = [ - # SparseTensors are fetched as SparseTensorValues. They can be fed - # SparseTensorValues or normal tuples. - (ops.SparseTensor, - lambda fetch: ( - [fetch.indices, fetch.values, fetch.shape], - lambda fetched_vals: ops.SparseTensorValue(*fetched_vals)), - lambda feed, feed_val: list(zip( - [feed.indices, feed.values, feed.shape], feed_val)), - lambda feed: [feed.indices, feed.values, feed.shape]), - # IndexedSlices are fetched as IndexedSlicesValues. They can be fed - # IndexedSlicesValues or normal tuples. 
- (ops.IndexedSlices, - lambda fetch: ( - [fetch.values, fetch.indices] if fetch.dense_shape is None - else [fetch.values, fetch.indices, fetch.dense_shape], - _get_indexed_slices_value_from_fetches), - _get_feeds_for_indexed_slices, - lambda feed: [feed.values, feed.indices] if feed.dense_shape is None - else [feed.values, feed.indices, feed.dense_shape]), - # The default catches all types and performs no expansions. - (object, - lambda fetch: ([fetch], lambda fetched_vals: fetched_vals[0]), - lambda feed, feed_val: [(feed, feed_val)], - lambda feed: [feed])] - # pylint: enable=g-long-lambda - def run(self, fetches, feed_dict=None, options=None, run_metadata=None): - """Runs the operations and evaluates the tensors in `fetches`. + """Runs operations and evaluates tensors in `fetches`. This method runs one "step" of TensorFlow computation, by running the necessary graph fragment to execute every `Operation` and evaluate every `Tensor` in `fetches`, substituting the values in `feed_dict` for the corresponding input values. - The `fetches` argument may be a single graph element, an arbitrarily nested - list of graph elements, or a dictionary whose values are the above. The type - of `fetches` determines the return value of this method. A graph element can - be one of the following types: + The `fetches` argument may be a single graph element, or an arbitrarily + nested list, tuple, namedtuple, or dict containing graph elements at its + leaves. A graph element can be one of the following types: - * If an element of `fetches` is an - [`Operation`](../../api_docs/python/framework.md#Operation), the - corresponding fetched value will be `None`. - * If an element of `fetches` is a - [`Tensor`](../../api_docs/python/framework.md#Tensor), the corresponding - fetched value will be a numpy ndarray containing the value of that tensor. 
- * If an element of `fetches` is a - [`SparseTensor`](../../api_docs/python/sparse_ops.md#SparseTensor), - the corresponding fetched value will be a + * An [`Operation`](../../api_docs/python/framework.md#Operation). + The corresponding fetched value will be `None`. + * A [`Tensor`](../../api_docs/python/framework.md#Tensor). + The corresponding fetched value will be a numpy ndarray containing the + value of that tensor. + * A [`SparseTensor`](../../api_docs/python/sparse_ops.md#SparseTensor). + The corresponding fetched value will be a [`SparseTensorValue`](../../api_docs/python/sparse_ops.md#SparseTensorValue) containing the value of that sparse tensor. - * If an element of `fetches` is produced by a `get_tensor_handle` op, - the corresponding fetched value will be a numpy ndarray containing the - handle of that tensor. + * A `get_tensor_handle` op. The corresponding fetched value will be a + numpy ndarray containing the handle of that tensor. + * A `string` which is the name of a tensor or operation in the graph. + + The value returned by `run()` has the same shape as the `fetches` argument, + where the leaves are replaced by the corresponding values returned by + TensorFlow. + + Example: + + ```python + a = tf.constant([10, 20]) + b = tf.constant([1.0, 2.0]) + # 'fetches' can be a singleton + v = session.run(a) + # v is the numpy array [10, 20] + # 'fetches' can be a list. + v = session.run([a, b]) + # v a Python list with 2 numpy arrays: the numpy array [10, 20] and the + # 1-D array [1.0, 2.0] + # 'fetches' can be arbitrary lists, tuples, namedtuple, dicts: + MyData = collections.namedtuple('MyData', ['a', 'b']) + v = session.run({'k1': MyData(a, b), 'k2': [b, a]}) + # v is a dict with + # v['k1'] is a MyData namedtuple with 'a' the numpy array [10, 20] and + # 'b' the numpy array [1.0, 2.0] + # v['k2'] is a list with the numpy array [1.0, 2.0] and the numpy array + # [10, 20]. 
+ ``` The optional `feed_dict` argument allows the caller to override the value of tensors in the graph. Each key in `feed_dict` can be @@ -433,6 +761,7 @@ class BaseSession(SessionInterface): Raises: tf.errors.OpError: Or one of its subclasses on error. """ + # TODO(touts): Support feeding and fetching the same tensor. return self._run(handle, fetches, feed_dict, None, None) def partial_run_setup(self, fetches, feeds=None): @@ -457,7 +786,7 @@ class BaseSession(SessionInterface): tf.errors.OpError: Or one of its subclasses if a TensorFlow error happens. """ def _feed_fn(feed): - for tensor_type, _, _, feed_fn in BaseSession._REGISTERED_EXPANSIONS: + for tensor_type, _, _, feed_fn in _REGISTERED_EXPANSIONS: if isinstance(feed, tensor_type): return feed_fn(feed) raise TypeError('Feed argument %r has invalid type %r' @@ -470,9 +799,6 @@ class BaseSession(SessionInterface): raise RuntimeError('The Session graph is empty. Add operations to the ' 'graph before calling run().') - # Validate and process fetches. - unique_fetches, target_list, _, _ = self._process_fetches(fetches) - # Create request. feed_list = [] @@ -492,6 +818,10 @@ class BaseSession(SessionInterface): e.args = (e.message,) raise e + # Validate and process fetches. + # TODO(touts): Support feeding and fetching the same tensor. + fetch_handler = _FetchHandler(self._graph, fetches, {}) + # Set up a graph with feeds and fetches for partial run. def _setup_fn(session, feed_list, fetch_list, target_list): self._extend_graph() @@ -499,71 +829,13 @@ class BaseSession(SessionInterface): return tf_session.TF_PRunSetup(session, feed_list, fetch_list, target_list, status) - return self._do_call(_setup_fn, self._session, feed_list, unique_fetches, - target_list) - - def _assert_fetchable(self, op): - if not self.graph.is_fetchable(op): - raise ValueError( - 'Operation %r has been marked as not fetchable.' 
% op.name) - - def _process_fetches(self, fetches): - """Validate and process fetches.""" - def _fetch_fn(fetch): - for tensor_type, fetch_fn, _, _ in BaseSession._REGISTERED_EXPANSIONS: - if isinstance(fetch, tensor_type): - return fetch_fn(fetch) - raise TypeError('Fetch argument %r has invalid type %r' - % (fetch, type(fetch))) - - # Validate and process fetches. - is_list_fetch = isinstance(fetches, (list, tuple)) - if not is_list_fetch: - fetches = [fetches] - - unique_fetch_targets = set() - unique_fetch_handles = {} - target_list = [] - - fetch_info = [] - for fetch in fetches: - subfetches, fetch_contraction_fn = _fetch_fn(fetch) - subfetch_names = [] - for subfetch in subfetches: - try: - fetch_t = self.graph.as_graph_element(subfetch, allow_tensor=True, - allow_operation=True) - fetch_name = compat.as_bytes(fetch_t.name) - if isinstance(fetch_t, ops.Operation): - self._assert_fetchable(fetch_t) - target_list.append(fetch_name) - else: - self._assert_fetchable(fetch_t.op) - subfetch_names.append(fetch_name) - # Remember the fetch if it is for a tensor handle. - if (isinstance(fetch_t, ops.Tensor) and - fetch_t.op.type == 'GetSessionHandle'): - unique_fetch_handles[fetch_name] = fetch_t.op.inputs[0].dtype - except TypeError as e: - raise TypeError('Fetch argument %r of %r has invalid type %r, ' - 'must be a string or Tensor. (%s)' - % (subfetch, fetch, type(subfetch), str(e))) - except ValueError as e: - raise ValueError('Fetch argument %r of %r cannot be interpreted as a ' - 'Tensor. (%s)' % (subfetch, fetch, str(e))) - except KeyError as e: - raise ValueError('Fetch argument %r of %r cannot be interpreted as a ' - 'Tensor. 
(%s)' % (subfetch, fetch, str(e))) - unique_fetch_targets.update(subfetch_names) - fetch_info.append((subfetch_names, fetch_contraction_fn)) - - unique_fetch_targets = list(unique_fetch_targets) - return unique_fetch_targets, target_list, fetch_info, unique_fetch_handles + return self._do_call(_setup_fn, self._session, feed_list, + fetch_handler.fetches(), fetch_handler.targets()) def _run(self, handle, fetches, feed_dict, options, run_metadata): - """Perform either run or partial_run, depending the exitence of `handle`.""" + """Perform either run or partial_run, depending the presence of `handle`.""" def _feed_fn(feed, feed_val): - for tensor_type, _, feed_fn, _ in BaseSession._REGISTERED_EXPANSIONS: + for tensor_type, _, feed_fn, _ in _REGISTERED_EXPANSIONS: if isinstance(feed, tensor_type): return feed_fn(feed, feed_val) raise TypeError('Feed argument %r has invalid type %r' @@ -576,31 +848,6 @@ class BaseSession(SessionInterface): raise RuntimeError('The Session graph is empty. Add operations to the ' 'graph before calling run().') - # Flatten/unflatten fetched values. - if isinstance(fetches, (list, tuple)): - # fetches is already a list or tuple; nothing to do. - orig_fetches, fetches = fetches, nest.flatten(fetches) - unflatten = lambda fetched: nest.pack_sequence_as(orig_fetches, fetched) - elif isinstance(fetches, dict): - # fetches is a dictionary; flatten the values and map fetched - # values back into to a dictionary. - # nest.flatten does not accept iterators, next line is for python3 - # compatibility. - fetches_values = list(fetches.values()) - orig_fetches, fetches = fetches, nest.flatten(fetches_values) - unflatten = lambda fetched: _unflatten_fetches(orig_fetches, fetched) - else: - # fetches is a singleton. - fetches = [fetches] - unflatten = lambda fetched: fetched[0] - - # Validate and process fetches. 
- processed_fetches = self._process_fetches(fetches) - unique_fetches = processed_fetches[0] - target_list = processed_fetches[1] - fetch_info = processed_fetches[2] - unique_handles = processed_fetches[3] - # Create request. feed_dict_string = {} feed_map = {} @@ -644,6 +891,9 @@ class BaseSession(SessionInterface): feed_dict_string[subfeed_name] = np_val feed_map[subfeed_name] = (subfeed_t, subfeed_val) + # Create a fetch handler to take care of the structure of fetches. + fetch_handler = _FetchHandler(self._graph, fetches, feed_dict_string) + # Run request and get response. # We need to keep the movers alive for the following _do_run(). # These movers are no longer needed when _do_run() completes, and @@ -651,29 +901,14 @@ class BaseSession(SessionInterface): # TODO(yuanbyu, keveman): Revisit whether we should just treat feeding # of a handle from a different device as an error. movers = self._update_with_movers(feed_dict_string, feed_map) - results = self._do_run(handle, target_list, unique_fetches, - feed_dict_string, options, run_metadata) - - # User may have fetched the same tensor multiple times, but we - # only fetch them from the runtime once. Furthermore, they may - # be wrapped as a tuple of tensors. Here we map the results back - # to what the client asked for. - # TODO(yuanbyu): Use the contraction_fn in _REGISTERED_EXPANSIONS. 
- fetched_results = {} - for fetch, result in zip(unique_fetches, results): - dtype = unique_handles.get(fetch) - if dtype: - result = session_ops.TensorHandle(result, dtype, self) - fetched_results[fetch] = result - ret = [] - for fetch_names, fetch_contraction_fn in fetch_info: - if fetch_names: - fetched_vals = [fetched_results[name] for name in fetch_names] - ret.append(fetch_contraction_fn(fetched_vals)) - else: - ret.append(None) - - return unflatten(ret) + final_fetches = fetch_handler.fetches() + final_targets = fetch_handler.targets() + if final_fetches or final_targets: + results = self._do_run(handle, final_targets, final_fetches, + feed_dict_string, options, run_metadata) + else: + results = [] + return fetch_handler.build_results(self, results) # Captures the name of a node in an error status. _NODEDEF_NAME_RE = re.compile(r'\[\[Node: ([^ ]*?) =') @@ -1000,8 +1235,7 @@ class InteractiveSession(BaseSession): Args: target: (Optional.) The execution engine to connect to. - Defaults to using an in-process engine. At present, no value - other than the empty string is supported. + Defaults to using an in-process engine. graph: (Optional.) The `Graph` to be launched (described above). config: (Optional) `ConfigProto` proto used to configure the session. 
""" diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py index 0ee95abd15d..462afc39e0e 100644 --- a/tensorflow/python/client/session_test.py +++ b/tensorflow/python/client/session_test.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import threading import time @@ -180,6 +181,172 @@ class SessionTest(test_util.TensorFlowTestCase): with self.assertRaises(TypeError): s.run({'a': a, 'b': None}) + def testFetchSingleton(self): + with session.Session() as sess: + a = constant_op.constant(42.0) + res = sess.run(a) + self.assertEqual(42.0, res) + res = sess.run(a.op) # An op, not a tensor. + self.assertEqual(None, res) + + def testFetchSingletonByName(self): + with session.Session() as sess: + a = constant_op.constant(42.0) + res = sess.run(a.name) + self.assertEqual(42.0, res) + res = sess.run(a.op) # An op, not a tensor. + self.assertEqual(None, res) + + def testFetchList(self): + with session.Session() as sess: + a = constant_op.constant(42.0) + b = control_flow_ops.no_op() # An op, not a tensor. + c = constant_op.constant(44.0) + v = variables.Variable([54.0]) + assign = v.assign([63.0]) + res = sess.run([a, b, c, a.name, assign.op]) + self.assertTrue(isinstance(res, list)) + self.assertEqual(42.0, res[0]) + self.assertEqual(None, res[1]) + self.assertEqual(44.0, res[2]) + self.assertEqual(42.0, res[3]) + self.assertEqual(None, res[4]) + self.assertEqual(63.0, sess.run(v)) + + def testFetchTuple(self): + with session.Session() as sess: + a = constant_op.constant(42.0) + b = control_flow_ops.no_op() # An op, not a tensor. 
+ c = constant_op.constant(44.0) + res = sess.run((a, b, c, a.name)) + self.assertTrue(isinstance(res, tuple)) + self.assertEqual(42.0, res[0]) + self.assertEqual(None, res[1]) + self.assertEqual(44.0, res[2]) + self.assertEqual(42.0, res[3]) + + def testFetchNamedTuple(self): + # pylint: disable=invalid-name + ABC = collections.namedtuple('ABC', ['a', 'b', 'c']) + # pylint: enable=invalid-name + with session.Session() as sess: + a = constant_op.constant(42.0) + b = control_flow_ops.no_op() # An op, not a tensor. + c = constant_op.constant(44.0) + res = sess.run(ABC(a, b, c)) + self.assertTrue(isinstance(res, ABC)) + self.assertEqual(42.0, res.a) + self.assertEqual(None, res.b) + self.assertEqual(44.0, res.c) + + def testFetchDict(self): + with session.Session() as sess: + a = constant_op.constant(42.0) + b = control_flow_ops.no_op() # An op, not a tensor. + c = constant_op.constant(44.0) + res = sess.run({'a': a, 'b': b, 'c': c}) + self.assertTrue(isinstance(res, dict)) + self.assertEqual(42.0, res['a']) + self.assertEqual(None, res['b']) + self.assertEqual(44.0, res['c']) + + def testFetchNestingOneLevel(self): + with session.Session() as sess: + # pylint: disable=invalid-name + ABC = collections.namedtuple('ABC', ['a', 'b', 'c']) + DEFG = collections.namedtuple('DEFG', ['d', 'e', 'f', 'g']) + # pylint: enable=invalid-name + a_val = 42.0 + b_val = None + c_val = 44.0 + a = constant_op.constant(a_val) + b = control_flow_ops.no_op() # An op, not a tensor. 
+ c = constant_op.constant(c_val) + # List of lists, tuples, namedtuple, and dict + res = sess.run([[a, b, c], (a, b, c), ABC(a=a, b=b, c=c), + {'a': a.name, 'c': c, 'b': b}]) + self.assertTrue(isinstance(res, list)) + self.assertTrue(isinstance(res[0], list)) + self.assertEqual(a_val, res[0][0]) + self.assertEqual(b_val, res[0][1]) + self.assertEqual(c_val, res[0][2]) + self.assertTrue(isinstance(res[1], tuple)) + self.assertEqual(a_val, res[1][0]) + self.assertEqual(b_val, res[1][1]) + self.assertEqual(c_val, res[1][2]) + self.assertTrue(isinstance(res[2], ABC)) + self.assertEqual(a_val, res[2].a) + self.assertEqual(b_val, res[2].b) + self.assertEqual(c_val, res[2].c) + self.assertTrue(isinstance(res[3], dict)) + self.assertEqual(a_val, res[3]['a']) + self.assertEqual(b_val, res[3]['b']) + self.assertEqual(c_val, res[3]['c']) + # Tuple of lists, tuples, namedtuple, and dict + res = sess.run(([a, b, c], (a.name, b, c), ABC(a=a, b=b, c=c), + {'a': a, 'c': c, 'b': b})) + self.assertTrue(isinstance(res, tuple)) + self.assertTrue(isinstance(res[0], list)) + self.assertEqual(a_val, res[0][0]) + self.assertEqual(b_val, res[0][1]) + self.assertEqual(c_val, res[0][2]) + self.assertTrue(isinstance(res[1], tuple)) + self.assertEqual(a_val, res[1][0]) + self.assertEqual(b_val, res[1][1]) + self.assertEqual(c_val, res[1][2]) + self.assertTrue(isinstance(res[2], ABC)) + self.assertEqual(a_val, res[2].a) + self.assertEqual(b_val, res[2].b) + self.assertEqual(c_val, res[2].c) + self.assertTrue(isinstance(res[3], dict)) + self.assertEqual(a_val, res[3]['a']) + self.assertEqual(b_val, res[3]['b']) + self.assertEqual(c_val, res[3]['c']) + # Namedtuple of lists, tuples, namedtuples, and dict + res = sess.run(DEFG(d=[a, b, c], + e=(a, b, c), + f=ABC(a=a.name, b=b, c=c), + g={'a': a, 'c': c, 'b': b})) + self.assertTrue(isinstance(res, DEFG)) + self.assertTrue(isinstance(res.d, list)) + self.assertEqual(a_val, res.d[0]) + self.assertEqual(b_val, res.d[1]) + self.assertEqual(c_val, 
res.d[2]) + self.assertTrue(isinstance(res.e, tuple)) + self.assertEqual(a_val, res.e[0]) + self.assertEqual(b_val, res.e[1]) + self.assertEqual(c_val, res.e[2]) + self.assertTrue(isinstance(res.f, ABC)) + self.assertEqual(a_val, res.f.a) + self.assertEqual(b_val, res.f.b) + self.assertEqual(c_val, res.f.c) + self.assertTrue(isinstance(res.g, dict)) + self.assertEqual(a_val, res.g['a']) + self.assertEqual(b_val, res.g['b']) + self.assertEqual(c_val, res.g['c']) + # Dict of lists, tuples, namedtuples, and dict + res = sess.run({'d': [a, b, c], + 'e': (a, b, c), + 'f': ABC(a=a, b=b, c=c), + 'g': {'a': a.name, 'c': c, 'b': b}}) + self.assertTrue(isinstance(res, dict)) + self.assertTrue(isinstance(res['d'], list)) + self.assertEqual(a_val, res['d'][0]) + self.assertEqual(b_val, res['d'][1]) + self.assertEqual(c_val, res['d'][2]) + self.assertTrue(isinstance(res['e'], tuple)) + self.assertEqual(a_val, res['e'][0]) + self.assertEqual(b_val, res['e'][1]) + self.assertEqual(c_val, res['e'][2]) + self.assertTrue(isinstance(res['f'], ABC)) + self.assertEqual(a_val, res['f'].a) + self.assertEqual(b_val, res['f'].b) + self.assertEqual(c_val, res['f'].c) + self.assertTrue(isinstance(res['g'], dict)) + self.assertEqual(a_val, res['g']['a']) + self.assertEqual(b_val, res['g']['b']) + self.assertEqual(c_val, res['g']['c']) + def testFetchTensorObject(self): with session.Session() as s: a = constant_op.constant(1.0, shape=[1, 2]) @@ -338,6 +505,11 @@ class SessionTest(test_util.TensorFlowTestCase): self.assertAllEqual(indices_out, indices) self.assertAllEqual(values_out, values) self.assertAllEqual(shape_out, shape) + # Feed with tuple, fetch sp directly + sp_out = s.run(sp, {sp: (indices, values, shape)}) + self.assertAllEqual(sp_out.indices, indices) + self.assertAllEqual(sp_out.values, values) + self.assertAllEqual(sp_out.shape, shape) # Feed with SparseTensorValue indices_out, values_out, shape_out = s.run( [sp_indices, sp_values, sp_shape], @@ -350,6 +522,11 @@ class 
SessionTest(test_util.TensorFlowTestCase): self.assertAllEqual(sp2_out.indices, indices) self.assertAllEqual(sp2_out.values, values) self.assertAllEqual(sp2_out.shape, shape) + # Feed SparseTensorValue and fetch sp directly. + sp_out = s.run(sp, {sp: ops.SparseTensorValue(indices, values, shape)}) + self.assertAllEqual(sp_out.indices, indices) + self.assertAllEqual(sp_out.values, values) + self.assertAllEqual(sp_out.shape, shape) def testFeedSparsePlaceholder(self): with session.Session() as s: @@ -869,7 +1046,7 @@ class SessionTest(test_util.TensorFlowTestCase): self.assertAllEqual(a2_val, [[1.0, 1.0]]) def testFeedAndFetch(self): - with session.Session(): + with session.Session() as sess: for dtype in [dtypes.float16, dtypes.float32, dtypes.float64, @@ -899,7 +1076,15 @@ class SessionTest(test_util.TensorFlowTestCase): np_array = np_array.astype(np_dtype) self.assertAllEqual(np_array, - out_t.eval(feed_dict={feed_t: np_array})) + sess.run(out_t, feed_dict={feed_t: np_array})) + # Check that we can also get the feed back. + self.assertAllEqual(np_array, + sess.run(feed_t, feed_dict={feed_t: np_array})) + # Also check that we can get both back. 
+ out_v, feed_v = sess.run([out_t, feed_t], + feed_dict={feed_t: np_array}) + self.assertAllEqual(np_array, out_v) + self.assertAllEqual(np_array, feed_v) def testFeedError(self): with session.Session() as sess: @@ -941,7 +1126,7 @@ class SessionTest(test_util.TensorFlowTestCase): self.assertAllEqual(c.eval(), c_list) def testStringFeed(self): - with session.Session(): + with session.Session() as sess: for shape in [(32, 4, 128), (37,), (2, 0, 6), (0, 0, 0)]: size = 1 for s in shape: @@ -950,7 +1135,12 @@ class SessionTest(test_util.TensorFlowTestCase): dtype=np.object).reshape(shape) feed_t = array_ops.placeholder(dtype=dtypes.string, shape=shape) c = array_ops.identity(feed_t) - self.assertAllEqual(c.eval(feed_dict={feed_t: c_list}), c_list) + self.assertAllEqual(sess.run(c, feed_dict={feed_t: c_list}), c_list) + self.assertAllEqual(sess.run(feed_t, feed_dict={feed_t: c_list}), + c_list) + c_v, feed_v = sess.run([c, feed_t], feed_dict={feed_t: c_list}) + self.assertAllEqual(c_v, c_list) + self.assertAllEqual(feed_v, c_list) def testStringFeedWithNullCharacters(self): with session.Session(): @@ -1184,14 +1374,6 @@ class SessionTest(test_util.TensorFlowTestCase): with self.assertRaisesRegexp(ValueError, 'may not be fed'): sess.run(reshaped_tensor, feed_dict={new_shape: [3, 7]}) - def testRunWithNoTargetsIsAnError(self): - with session.Session() as sess: - _ = constant_op.constant(5.0) - with self.assertRaisesRegexp( - errors.InvalidArgumentError, - 'Must specify at least one target to fetch or execute.'): - sess.run([]) - def testInferShapesFalse(self): with ops.Graph().as_default(), ops.device('/cpu:0'): a = constant_op.constant([[1, 2]]) diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i index 659aa4a748c..87391fff68d 100644 --- a/tensorflow/python/client/tf_session.i +++ b/tensorflow/python/client/tf_session.i @@ -213,7 +213,7 @@ tensorflow::ImportNumpy(); reinterpret_cast($1.data), $1.length); } -// Include the functions 
from tensor_c_api.h, except TF_Run. +// Include the functions from c_api.h, except TF_Run. %ignoreall %unignore TF_Code; %unignore TF_Status; @@ -238,7 +238,7 @@ tensorflow::ImportNumpy(); %unignore TF_NewLibrary; %unignore TF_LoadLibrary; %unignore TF_GetOpList; -%include "tensorflow/core/public/tensor_c_api.h" +%include "tensorflow/c/c_api.h" %ignoreall %insert("python") %{ diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc index a203a7539af..68ef4920ced 100644 --- a/tensorflow/python/client/tf_session_helper.cc +++ b/tensorflow/python/client/tf_session_helper.cc @@ -17,13 +17,13 @@ limitations under the License. #include +#include "tensorflow/c/tf_status_helper.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/log_memory.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/graph/equal_graph_def.h" #include "tensorflow/core/lib/core/coding.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/tf_status_helper.h" namespace tensorflow { diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h index 591b8774b64..83cab586d8b 100644 --- a/tensorflow/python/client/tf_session_helper.h +++ b/tensorflow/python/client/tf_session_helper.h @@ -19,11 +19,11 @@ limitations under the License. 
// Must be included first #include "tensorflow/python/lib/core/numpy.h" +#include "tensorflow/c/c_api.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" -#include "tensorflow/core/public/tensor_c_api.h" namespace tensorflow { diff --git a/tensorflow/python/debug/session_debug_test.py b/tensorflow/python/debug/session_debug_test.py new file mode 100644 index 00000000000..d9fdb240c9d --- /dev/null +++ b/tensorflow/python/debug/session_debug_test.py @@ -0,0 +1,298 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for debugger functionalities in tf.Session.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import glob +import os +import shutil +import tempfile + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensorflow.core.protobuf import config_pb2 +from tensorflow.core.util import event_pb2 +from tensorflow.python.client import session +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import tensor_util +from tensorflow.python.framework import test_util +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import googletest + + +class SessionDebugTest(test_util.TensorFlowTestCase): + + def setUp(self): + self.dump_root_ = tempfile.mkdtemp() + + def tearDown(self): + # Tear down temporary dump directory. + shutil.rmtree(self.dump_root_) + + def _addDebugTensorWatch(self, + run_opts, + node_name, + output_slot, + debug_op="DebugIdentity", + debug_urls=None): + watch_opts = run_opts.debug_tensor_watch_opts + + # Add debug tensor watch for u. + watch = watch_opts.add() + watch.node_name = node_name + watch.output_slot = 0 + watch.debug_ops.append(debug_op) + + if debug_urls: + for debug_url in debug_urls: + watch.debug_urls.append(debug_url) + + def _verifyTensorDumpFile(self, dump_file, expected_tensor_name, debug_op, + wall_time_lower_bound, expected_tensor_val): + """Helper method: Verify a Tensor debug dump file and its content. + + Args: + dump_file: Path to the dump file. + expected_tensor_name: Expected name of the tensor, e.g., node_a:0. + debug_op: Name of the debug Op, e.g., DebugIdentity. + wall_time_lower_bound: Lower bound of the wall time. 
+ expected_tensor_val: Expected tensor value, as a numpy array. + """ + self.assertTrue(os.path.isfile(dump_file)) + + event = event_pb2.Event() + f = open(dump_file, "rb") + event.ParseFromString(f.read()) + + wall_time = event.wall_time + debg_node_name = event.summary.value[0].node_name + + tensor_value = tensor_util.MakeNdarray(event.summary.value[0].tensor) + + self.assertGreater(wall_time, wall_time_lower_bound) + self.assertEqual("%s:%s" % (expected_tensor_name, debug_op), debg_node_name) + + if expected_tensor_val.dtype.type is np.string_: + self.assertEqual(str(expected_tensor_val), str(tensor_value)) + else: + self.assertAllClose(expected_tensor_val, tensor_value) + + def testDumpToFileOverlaoppinpParentDir(self): + with session.Session() as sess: + u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]]) + v_init_val = np.array([[2.0], [-1.0]]) + + # Use node names with overlapping namespace (i.e., parent directory) to + # test concurrent, non-racing directory creation. + u_name = "testDumpToFile/u" + v_name = "testDumpToFile/v" + + u_init = constant_op.constant(u_init_val, shape=[2, 2]) + u = variables.Variable(u_init, name=u_name) + v_init = constant_op.constant(v_init_val, shape=[2, 1]) + v = variables.Variable(v_init, name=v_name) + + w = math_ops.matmul(u, v, name="testDumpToFile/matmul") + + u.initializer.run() + v.initializer.run() + + run_options = config_pb2.RunOptions() + debug_url = "file://%s" % self.dump_root_ + + # Add debug tensor watch for u. + self._addDebugTensorWatch( + run_options, "%s/read" % u_name, 0, debug_urls=[debug_url]) + # Add debug tensor watch for v. + self._addDebugTensorWatch( + run_options, "%s/read" % v_name, 0, debug_urls=[debug_url]) + + run_metadata = config_pb2.RunMetadata() + + # Invoke Session.run(). + sess.run(w, options=run_options, run_metadata=run_metadata) + + # Verify the dump file for u. 
+ dump_files = os.listdir(os.path.join(self.dump_root_, u_name)) + self.assertEqual(1, len(dump_files)) + self.assertTrue(dump_files[0].startswith("read_0_")) + + dump_file = os.path.join(self.dump_root_, u_name, dump_files[0]) + self._verifyTensorDumpFile(dump_file, "%s/read:0" % u_name, + "DebugIdentity", 0, u_init_val) + + # Verify the dump file for v. + dump_files = os.listdir(os.path.join(self.dump_root_, v_name)) + self.assertEqual(1, len(dump_files)) + self.assertTrue(dump_files[0].startswith("read_0_")) + + dump_file = os.path.join(self.dump_root_, v_name, dump_files[0]) + self._verifyTensorDumpFile(dump_file, "%s/read:0" % v_name, + "DebugIdentity", 0, v_init_val) + + def testDumpStringTensorsToFileSystem(self): + with session.Session() as sess: + str1_init_val = np.array(b"abc") + str2_init_val = np.array(b"def") + + str1_init = constant_op.constant(str1_init_val) + str2_init = constant_op.constant(str2_init_val) + + str1_name = "str1" + str2_name = "str2" + str1 = variables.Variable(str1_init, name=str1_name) + str2 = variables.Variable(str2_init, name=str2_name) + # Concatenate str1 and str2 + str_concat = math_ops.add(str1, str2, name="str_concat") + + str1.initializer.run() + str2.initializer.run() + + run_options = config_pb2.RunOptions() + debug_url = "file://%s" % self.dump_root_ + + # Add debug tensor watch for u. + self._addDebugTensorWatch( + run_options, "%s/read" % str1_name, 0, debug_urls=[debug_url]) + # Add debug tensor watch for v. + self._addDebugTensorWatch( + run_options, "%s/read" % str2_name, 0, debug_urls=[debug_url]) + + run_metadata = config_pb2.RunMetadata() + + # Invoke Session.run(). + sess.run(str_concat, options=run_options, run_metadata=run_metadata) + + # Verify the dump file for str1. 
+ dump_files = os.listdir(os.path.join(self.dump_root_, str1_name)) + self.assertEqual(1, len(dump_files)) + self.assertTrue(dump_files[0].startswith("read_0_")) + dump_file = os.path.join(self.dump_root_, str1_name, dump_files[0]) + self._verifyTensorDumpFile(dump_file, "%s/read:0" % str1_name, + "DebugIdentity", 0, str1_init_val) + + # Verify the dump file for str2. + dump_files = os.listdir(os.path.join(self.dump_root_, str2_name)) + self.assertEqual(1, len(dump_files)) + self.assertTrue(dump_files[0].startswith("read_0_")) + dump_file = os.path.join(self.dump_root_, str2_name, dump_files[0]) + self._verifyTensorDumpFile(dump_file, "%s/read:0" % str2_name, + "DebugIdentity", 0, str2_init_val) + + def testDumpToFileWhileLoop(self): + with session.Session() as sess: + num_iter = 10 + + # "u" is the Variable being updated in the loop. + u_name = "testDumpToFileWhileLoop/u" + u_namespace = u_name.split("/")[0] + + u_init_val = np.array(11.0) + u_init = constant_op.constant(u_init_val) + u = variables.Variable(u_init, name=u_name) + + # "v" is the increment. + v_name = "testDumpToFileWhileLoop/v" + v_namespace = v_name.split("/")[0] + + v_init_val = np.array(2.0) + v_init = constant_op.constant(v_init_val) + v = variables.Variable(v_init, name=v_name) + + u.initializer.run() + v.initializer.run() + + i = constant_op.constant(0, name="testDumpToFileWhileLoop/i") + + def cond(i): + return math_ops.less(i, num_iter) + + def body(i): + new_u = state_ops.assign_add(u, v) + new_i = math_ops.add(i, 1) + op = control_flow_ops.group(new_u) + new_i = control_flow_ops.with_dependencies([op], new_i) + return [new_i] + + loop = control_flow_ops.while_loop(cond, body, [i], parallel_iterations=1) + + # Create RunOptions for debug-watching tensors + run_options = config_pb2.RunOptions() + debug_url = "file://%s" % self.dump_root_ + + # Add debug tensor watch for u. + self._addDebugTensorWatch(run_options, u_name, 0, debug_urls=[debug_url]) + # Add debug tensor watch for v. 
+ self._addDebugTensorWatch( + run_options, "%s/read" % v_name, 0, debug_urls=[debug_url]) + # Add debug tensor watch for while/Identity. + self._addDebugTensorWatch( + run_options, "while/Identity", 0, debug_urls=[debug_url]) + + run_metadata = config_pb2.RunMetadata() + + r = sess.run(loop, options=run_options, run_metadata=run_metadata) + + self.assertEqual(num_iter, r) + + u_val_final = sess.run(u) + self.assertAllClose(u_init_val + num_iter * v_init_val, u_val_final) + + # Verify dump files + self.assertTrue(os.path.isdir(self.dump_root_)) + + self.assertTrue(os.path.isdir(os.path.join(self.dump_root_, u_namespace))) + self.assertTrue( + os.path.isdir(os.path.join(self.dump_root_, v_namespace, "v"))) + + # Verify the dump file for tensor "u". + dump_files = glob.glob( + os.path.join(self.dump_root_, u_namespace, "u_0_*")) + self.assertEqual(1, len(dump_files)) + dump_file = os.path.join(self.dump_root_, u_namespace, dump_files[0]) + self.assertTrue(os.path.isfile(dump_file)) + self._verifyTensorDumpFile(dump_file, "%s:0" % u_name, "DebugIdentity", 0, + u_init_val) + + # Verify the dump file for tensor "v". 
+ dump_files = os.listdir(os.path.join(self.dump_root_, v_name)) + self.assertEqual(1, len(dump_files)) + self.assertTrue(dump_files[0].startswith("read_0_")) + + dump_file = os.path.join(self.dump_root_, v_name, dump_files[0]) + self._verifyTensorDumpFile(dump_file, "%s/read:0" % v_name, + "DebugIdentity", 0, v_init_val) + + # Verify the dump files for tensor while/Identity + while_identity_dump_files = sorted( + os.listdir(os.path.join(self.dump_root_, "while"))) + self.assertEqual(num_iter, len(while_identity_dump_files)) + + # Verify the content of the individual + for k in xrange(len(while_identity_dump_files)): + dump_file_path = os.path.join(self.dump_root_, "while", + while_identity_dump_files[k]) + self._verifyTensorDumpFile(dump_file_path, "while/Identity:0", + "DebugIdentity", 0, np.array(k)) + + +if __name__ == "__main__": + googletest.main() diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py index 1f29426b4cf..92ae917b648 100644 --- a/tensorflow/python/framework/dtypes.py +++ b/tensorflow/python/framework/dtypes.py @@ -240,8 +240,13 @@ class DType(object): def __eq__(self, other): """Returns True iff this DType refers to the same type as `other`.""" - return (other is not None - and self._type_enum == as_dtype(other).as_datatype_enum) + if other is None: + return False + try: + dtype = as_dtype(other).as_datatype_enum + return self._type_enum == dtype + except TypeError: + return False def __ne__(self, other): """Returns True iff self != other.""" diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py index 42d3be90185..7fa8ef618b3 100644 --- a/tensorflow/python/framework/dtypes_test.py +++ b/tensorflow/python/framework/dtypes_test.py @@ -247,6 +247,9 @@ class TypesTest(test_util.TensorFlowTestCase): self.assertEquals(type(dtype2), tf.DType) self.assertEquals(dtype, dtype2) + def testEqWithNonTFTypes(self): + self.assertNotEqual(tf.int32, int) + 
self.assertNotEqual(tf.float64, 2.1) if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/framework/framework_lib.py b/tensorflow/python/framework/framework_lib.py index b06605cf592..3f77187a25c 100644 --- a/tensorflow/python/framework/framework_lib.py +++ b/tensorflow/python/framework/framework_lib.py @@ -72,6 +72,7 @@ from tensorflow.python.framework.device import DeviceSpec from tensorflow.python.framework.ops import Graph from tensorflow.python.framework.ops import Operation from tensorflow.python.framework.ops import Tensor +from tensorflow.python.framework.ops import Output from tensorflow.python.framework.ops import SparseTensor from tensorflow.python.framework.ops import SparseTensorValue from tensorflow.python.framework.ops import IndexedSlices diff --git a/tensorflow/python/framework/gen_docs_combined.py b/tensorflow/python/framework/gen_docs_combined.py index bfea7b6aca7..63557302103 100644 --- a/tensorflow/python/framework/gen_docs_combined.py +++ b/tensorflow/python/framework/gen_docs_combined.py @@ -60,11 +60,13 @@ def get_module_to_name(): tf.contrib.distributions: "tf.contrib.distributions", tf.contrib.ffmpeg: "tf.contrib.ffmpeg", tf.contrib.framework: "tf.contrib.framework", + tf.contrib.graph_editor: "tf.contrib.graph_editor", tf.contrib.layers: "tf.contrib.layers", tf.contrib.learn: "tf.contrib.learn", tf.contrib.learn.monitors: ( "tf.contrib.learn.monitors"), tf.contrib.losses: "tf.contrib.losses", + tf.contrib.rnn: "tf.contrib.rnn", tf.contrib.metrics: "tf.contrib.metrics", tf.contrib.util: "tf.contrib.util", } @@ -118,7 +120,7 @@ def all_libraries(module_to_name, members, documented): library("tensor_array_ops", "TensorArray Operations", prefix=PREFIX_TEXT), library("session_ops", "Tensor Handle Operations", prefix=PREFIX_TEXT), library("image", "Images", tf.image, exclude_symbols=["ResizeMethod"], - prefix=PREFIX_TEXT), + prefix=PREFIX_TEXT), library("sparse_ops", "Sparse Tensors", exclude_symbols=["serialize_sparse", 
"serialize_many_sparse", @@ -166,16 +168,19 @@ def all_libraries(module_to_name, members, documented): tf.contrib.distributions), library("contrib.ffmpeg", "FFmpeg (contrib)", ffmpeg), library("contrib.framework", "Framework (contrib)", tf.contrib.framework), + library("contrib.graph_editor", "Graph Editor (contrib)", + tf.contrib.graph_editor), library("contrib.layers", "Layers (contrib)", tf.contrib.layers), library("contrib.learn", "Learn (contrib)", tf.contrib.learn), library("contrib.learn.monitors", "Monitors (contrib)", tf.contrib.learn.monitors), library("contrib.losses", "Losses (contrib)", tf.contrib.losses), + library("contrib.rnn", "RNN (contrib)", tf.contrib.rnn), library("contrib.metrics", "Metrics (contrib)", tf.contrib.metrics), library("contrib.util", "Utilities (contrib)", tf.contrib.util), library("contrib.copy_graph", "Copying Graph Elements (contrib)", tf.contrib.copy_graph), - ] + ] _hidden_symbols = ["Event", "LogMessage", "Summary", "SessionLog", "xrange", "HistogramProto", "ConfigProto", "NodeDef", "GraphDef", diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index f89f3d46972..854d46b955e 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -185,7 +185,10 @@ def register_dense_tensor_like_type(tensor_type): class Tensor(object): - """Represents a value produced by an `Operation`. + """Represents one of the outputs of an `Operation`. + + *Note:* the `Tensor` class will be replaced by `Output` in the future. + Currently these two are aliases for each other. A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. It does not hold the values of that operation's output, @@ -556,6 +559,10 @@ class Tensor(object): return _eval_using_default_session(self, feed_dict, self.graph, session) +# TODO(josh11b): Switch everyone from "Tensor" to "Output" to match C++ API. 
+Output = Tensor + + def _TensorTensorConversionFunction(t, dtype=None, name=None, as_ref=False): _ = name, as_ref if dtype and not dtype.is_compatible_with(t.dtype): diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py index 098558fd3e3..47341775cf6 100644 --- a/tensorflow/python/framework/tensor_util.py +++ b/tensorflow/python/framework/tensor_util.py @@ -382,7 +382,8 @@ def make_tensor_proto(values, dtype=None, shape=None): if is_quantized: numpy_dtype = dtype - if dtype is not None and not dtype.base_dtype == numpy_dtype.base_dtype: + if dtype is not None and (not hasattr(dtype, "base_dtype") or + dtype.base_dtype != numpy_dtype.base_dtype): raise TypeError("Incompatible types: %s vs. %s" % (dtype, nparray.dtype)) # If shape is not given, get the shape from the numpy array. diff --git a/tensorflow/python/framework/test_ops.cc b/tensorflow/python/framework/test_ops.cc index a832fcc643a..b9a940efe3a 100644 --- a/tensorflow/python/framework/test_ops.cc +++ b/tensorflow/python/framework/test_ops.cc @@ -13,13 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { -REGISTER_OP("KernelLabel").Output("result: string"); +REGISTER_OP("KernelLabel") + .Output("result: string") + .SetShapeFn(shape_inference::ScalarShape); REGISTER_OP("GraphDefVersion").Output("version: int32").SetIsStateful(); diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 630c72fa5c6..3342617cad5 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -502,23 +502,26 @@ class TensorFlowTestCase(googletest.TestCase): expected_err_re_or_predicate): """Returns a context manager to enclose code expected to raise an exception. + If the exception is an OpError, the op stack is also included in the message + predicate search. + Args: exception_type: The expected type of exception that should be raised. expected_err_re_or_predicate: If this is callable, it should be a function - of one argument that inspects the passed-in OpError exception and + of one argument that inspects the passed-in exception and returns True (success) or False (please fail the test). Otherwise, the error message is expected to match this regular expression partially. Returns: A context manager to surround code that is expected to raise an - errors.OpError exception. + exception. 
""" if callable(expected_err_re_or_predicate): predicate = expected_err_re_or_predicate else: def predicate(e): - err_str = e.message - op = e.op + err_str = e.message if isinstance(e, errors.OpError) else str(e) + op = e.op if isinstance(e, errors.OpError) else None while op is not None: err_str += "\nCaused by: " + op.name op = op._original_op diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 678a7452999..8532fe3ecf0 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -11,6 +11,7 @@ package( licenses(["notice"]) # Apache 2.0 load("//tensorflow:tensorflow.bzl", "py_tests") +load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow:tensorflow.bzl", "cuda_py_tests") load("//tensorflow:tensorflow.bzl", "cuda_py_test") @@ -32,7 +33,6 @@ py_tests( "decode_png_op_test.py", "decode_raw_op_test.py", "determinant_op_test.py", - "diag_op_test.py", "edit_distance_op_test.py", "fifo_queue_test.py", "identity_op_py_test.py", @@ -71,6 +71,13 @@ py_tests( ], ) +cuda_py_tests( + name = "cast_op_test", + size = "small", + srcs = ["cast_op_test.py"], + tags = ["noasan"], +) + cuda_py_test( name = "dense_update_ops_no_tsan_test", size = "small", @@ -78,6 +85,13 @@ cuda_py_test( tags = ["notsan"], ) +tf_py_test( + name = "diag_op_test", + size = "medium", + srcs = ["diag_op_test.py"], + shard_count = 2, +) + py_tests( name = "reader_ops_test", size = "small", @@ -87,13 +101,6 @@ py_tests( ], ) -cuda_py_tests( - name = "cast_op_test", - size = "small", - srcs = ["cast_op_test.py"], - tags = ["noasan"], -) - cuda_py_tests( name = "kernel_tests", size = "small", @@ -108,14 +115,12 @@ cuda_py_tests( "constant_op_test.py", "control_flow_ops_py_test.py", "conv1d_test.py", - "conv2d_backprop_filter_grad_test.py", "conv2d_transpose_test.py", "conv3d_backprop_filter_v2_grad_test.py", "cross_grad_test.py", "denormal_test.py", "dense_update_ops_test.py", "depthtospace_op_test.py", - 
"depthwise_conv_op_test.py", "division_past_test.py", "dynamic_partition_op_test.py", "dynamic_stitch_op_test.py", @@ -135,7 +140,6 @@ cuda_py_tests( "pack_op_test.py", "pad_op_test.py", "padding_fifo_queue_test.py", - "pooling_ops_3d_test.py", "py_func_test.py", "random_crop_test.py", "random_ops_test.py", @@ -177,11 +181,14 @@ cuda_py_tests( name = "medium_kernel_tests", size = "medium", srcs = [ - "concat_op_test.py", + "atrous_conv2d_test.py", + "conv2d_backprop_filter_grad_test.py", "conv3d_transpose_test.py", "conv_ops_test.py", + "depthwise_conv_op_test.py", # http://b/30603882 "division_future_test.py", "fft_ops_test.py", + "pooling_ops_3d_test.py", # http://b/30600785 "pooling_ops_test.py", "random_gamma_test.py", "rnn_test.py", @@ -194,6 +201,13 @@ cuda_py_tests( ], ) +cuda_py_test( + name = "concat_op_test", + size = "medium", + srcs = ["concat_op_test.py"], + tags = ["notsan"], # http://b/30445083 +) + cuda_py_tests( name = "kernel_tests_with_sharding", size = "medium", @@ -203,6 +217,7 @@ cuda_py_tests( "cwise_ops_test.py", "embedding_ops_test.py", "linalg_grad_test.py", + "svd_op_test.py", ], shard_count = 50, tags = ["notap"], # b/30226163 diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py index d908f9a7f67..8a6ba3615a1 100644 --- a/tensorflow/python/kernel_tests/array_ops_test.py +++ b/tensorflow/python/kernel_tests/array_ops_test.py @@ -278,12 +278,10 @@ class StridedSliceChecker(object): self.x_np = np.array(x) def __getitem__(self, spec): - # TODO(aselle): When NewSliceHelper is installed, we can switch this back - # op = self.x[spec] - op = array_ops._NewSliceHelper(self.x, spec) + op = self.x.__getitem__(spec) tensor = op.eval() - self.test.assertAllEqual(self.x_np[spec], tensor) + self.test.assertAllEqual(self.x_np.__getitem__(spec), tensor) self.test.assertAllEqual(tensor.shape, op.get_shape()) return tensor @@ -296,9 +294,8 @@ class StridedSliceTest(test_util.TensorFlowTestCase): 
tf.float64]: for use_gpu in [False, True]: with self.test_session(use_gpu=use_gpu): - checker = StridedSliceChecker(self, - StridedSliceChecker.REF_TENSOR, - tensor_type=tensor_type) + checker = StridedSliceChecker( + self, StridedSliceChecker.REF_TENSOR, tensor_type=tensor_type) _ = checker[:, :, :] # Various ways of representing identity slice _ = checker[:, :, :] @@ -400,9 +397,7 @@ class StridedSliceShapeChecker(object): self.x = x def __getitem__(self, spec): - # TODO(aselle): When NewSliceHelper is installed, we can switch this back - # op = self.x[spec] - op = array_ops._NewSliceHelper(self.x, spec) + op = self.x.__getitem__(spec) return op.get_shape() @@ -456,22 +451,28 @@ class GradSliceChecker(object): self.varnp = varnp def __getitem__(self, spec): - val_grad_op = tf.gradients(self.val, self.var) - sliceval_grad_op = tf.gradients( - array_ops._NewSliceHelper(self.val, spec), self.var) - slice1_op = array_ops._NewSliceHelper(val_grad_op, spec) - slice2_op = array_ops._NewSliceHelper(sliceval_grad_op, spec) - val_grad, sliceval_grad, slice1, slice2 = self.sess.run( - [val_grad_op, sliceval_grad_op, slice1_op, slice2_op]) - np_val_grad = (2 * self.varnp) + slice_var = self.var[spec] + slice_val = self.val[spec] + + # compute analytic 2nd derivative + analytic_grad2 = 2 * slice_val + + dy = tf.Variable(tf.ones(shape=slice_var.get_shape(), dtype=tf.int32)) + assign = dy.assign(slice_var) + slice_val_grad, = tf.gradients(slice_val, self.var, grad_ys=dy) + slice_val_grad2, = tf.gradients(slice_val_grad, dy, grad_ys=self.var) + self.sess.run(assign) + slice_val_grad_evaled, slice_val_grad2_evaled = ( + self.sess.run([slice_val_grad, slice_val_grad2])) + analytic_grad2_evaled = analytic_grad2.eval() + self.test.assertAllEqual(slice_val_grad2_evaled, analytic_grad2_evaled) + + # compute analytic gradient for slice + np_val_grad = (2 * self.varnp * self.varnp) np_sliceval_grad = np.zeros(self.var.get_shape()) - np_sliceval_grad[spec] = np.array(val_grad[0])[spec] - 
# make sure np val grad is correct - self.test.assertAllEqual(np_val_grad, val_grad[0]) - # make sure slice gradient is correct - self.test.assertAllEqual(np_sliceval_grad, sliceval_grad[0]) - # make sure val grad and sliceval grad are the same in sliced area - self.test.assertAllEqual(slice1, slice2) + np_sliceval_grad[spec] = np_val_grad[spec] + # verify gradient + self.test.assertAllEqual(slice_val_grad_evaled, np_sliceval_grad) class StridedSliceGradTest(test_util.TensorFlowTestCase): @@ -492,13 +493,59 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase): _ = grad[3:0:-2, 1:3, 2] +class StridedSliceGradTypeTest(test_util.TensorFlowTestCase): + """Test varied index types and host located memory.""" + + def testHostVsDevice(self): + with self.test_session(use_gpu=True) as sess: + var2 = tf.Variable( + tf.reshape( + tf.cast(tf.range(1, 5, 1), tf.float32), shape=(4, 1, 1))) + varshape = tf.Variable([6, 4, 4], dtype=tf.int32) + sess.run(tf.initialize_all_variables()) + begin = tf.constant([0, 0, 0]) + end = tf.constant([4, 1, 1]) + strides = tf.constant([1, 1, 1]) + foo = array_ops.strided_slice_grad(varshape, begin, end, strides, var2) + sess.run(foo) + + def testInt64Shape(self): + with self.test_session(use_gpu=True) as sess: + original_dy = tf.reshape( + tf.cast(tf.range(1, 5, 1), tf.float32), shape=(4, 1, 1)) + original_shape = tf.constant([6, 4, 4], dtype=tf.int64) + sess.run(tf.initialize_all_variables()) + begin = tf.constant([0, 0, 0], dtype=tf.int64) + end = tf.constant([4, 1, 1], dtype=tf.int64) + strides = tf.constant([1, 1, 1], dtype=tf.int64) + dx = array_ops.strided_slice_grad(original_shape, begin, end, strides, + original_dy) + sess.run(dx) + + def testMixedIndexTypes(self): + with self.test_session(use_gpu=True) as sess: + original_dy = tf.reshape( + tf.cast(tf.range(1, 5, 1), tf.float32), shape=(4, 1, 1)) + original_shape = tf.constant([6, 4, 4], dtype=tf.int64) + sess.run(tf.initialize_all_variables()) + begin = tf.constant([0, 0, 0], 
dtype=tf.int32) + end = tf.constant([4, 1, 1], dtype=tf.int64) + strides = tf.constant([1, 1, 1], dtype=tf.int64) + with self.assertRaisesRegexp( + TypeError, "Input 'begin' of 'StridedSliceGrad' Op has type int32" + " that does not match type int64 of argument 'shape'"): + dx = array_ops.strided_slice_grad(original_shape, begin, end, strides, + original_dy) + sess.run(dx) + + class BenchmarkSlice(object): def __init__(self, tensor): self.tensor = tensor def __getitem__(self, x): - return array_ops._NewSliceHelper(self.tensor, x) + return self.tensor[x] class StridedSliceBenchmark(tf.test.Benchmark): diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py index 182f2652560..e93bada771d 100644 --- a/tensorflow/python/kernel_tests/check_ops_test.py +++ b/tensorflow/python/kernel_tests/check_ops_test.py @@ -70,9 +70,10 @@ class AssertEqualTest(tf.test.TestCase): with self.test_session(): small = tf.constant([1, 2], name="small") big = tf.constant([3, 4], name="big") - with tf.control_dependencies([tf.assert_equal(big, small)]): + with tf.control_dependencies( + [tf.assert_equal(big, small, message="fail")]): out = tf.identity(small) - with self.assertRaisesOpError("big.*small"): + with self.assertRaisesOpError("fail.*big.*small"): out.eval() def test_raises_when_less(self): @@ -115,9 +116,10 @@ class AssertLessTest(tf.test.TestCase): def test_raises_when_equal(self): with self.test_session(): small = tf.constant([1, 2], name="small") - with tf.control_dependencies([tf.assert_less(small, small)]): + with tf.control_dependencies( + [tf.assert_less(small, small, message="fail")]): out = tf.identity(small) - with self.assertRaisesOpError("small.*small"): + with self.assertRaisesOpError("fail.*small.*small"): out.eval() def test_raises_when_greater(self): @@ -176,9 +178,10 @@ class AssertLessEqualTest(tf.test.TestCase): with self.test_session(): small = tf.constant([1, 2], name="small") big = tf.constant([3, 4], 
name="big") - with tf.control_dependencies([tf.assert_less_equal(big, small)]): + with tf.control_dependencies( + [tf.assert_less_equal(big, small, message="fail")]): out = tf.identity(small) - with self.assertRaisesOpError("big.*small"): + with self.assertRaisesOpError("fail.*big.*small"): out.eval() def test_doesnt_raise_when_less_equal(self): @@ -227,9 +230,9 @@ class AssertNegativeTest(tf.test.TestCase): def test_raises_when_positive(self): with self.test_session(): doug = tf.constant([1, 2], name="doug") - with tf.control_dependencies([tf.assert_negative(doug)]): + with tf.control_dependencies([tf.assert_negative(doug, message="fail")]): out = tf.identity(doug) - with self.assertRaisesOpError("doug"): + with self.assertRaisesOpError("fail.*doug"): out.eval() def test_raises_when_zero(self): @@ -257,9 +260,10 @@ class AssertPositiveTest(tf.test.TestCase): def test_raises_when_negative(self): with self.test_session(): freddie = tf.constant([-1, -2], name="freddie") - with tf.control_dependencies([tf.assert_positive(freddie)]): + with tf.control_dependencies( + [tf.assert_positive(freddie, message="fail")]): out = tf.identity(freddie) - with self.assertRaisesOpError("freddie"): + with self.assertRaisesOpError("fail.*freddie"): out.eval() def test_doesnt_raise_when_positive(self): @@ -295,16 +299,19 @@ class AssertRankTest(tf.test.TestCase): with self.test_session(): tensor = tf.constant(1, name="my_tensor") desired_rank = 1 - with self.assertRaisesRegexp(ValueError, "my_tensor.*must have rank 1"): - with tf.control_dependencies([tf.assert_rank(tensor, desired_rank)]): + with self.assertRaisesRegexp( + ValueError, "fail.*my_tensor.*must have rank 1"): + with tf.control_dependencies( + [tf.assert_rank(tensor, desired_rank, message="fail")]): tf.identity(tensor).eval() def test_rank_zero_tensor_raises_if_rank_too_small_dynamic_rank(self): with self.test_session(): tensor = tf.placeholder(tf.float32, name="my_tensor") desired_rank = 1 - with 
tf.control_dependencies([tf.assert_rank(tensor, desired_rank)]): - with self.assertRaisesOpError("my_tensor.*rank"): + with tf.control_dependencies( + [tf.assert_rank(tensor, desired_rank, message="fail")]): + with self.assertRaisesOpError("fail.*my_tensor.*rank"): tf.identity(tensor).eval(feed_dict={tensor: 0}) def test_rank_zero_tensor_doesnt_raise_if_rank_just_right_static_rank(self): @@ -384,7 +391,7 @@ class AssertRankTest(tf.test.TestCase): def test_raises_if_rank_is_not_integer_static(self): with self.test_session(): tensor = tf.constant([1, 2], name="my_tensor") - with self.assertRaisesRegexp(ValueError, + with self.assertRaisesRegexp(TypeError, "must be of type "): tf.assert_rank(tensor, .5) @@ -392,7 +399,7 @@ class AssertRankTest(tf.test.TestCase): with self.test_session(): tensor = tf.constant([1, 2], dtype=tf.float32, name="my_tensor") rank_tensor = tf.placeholder(tf.float32, name="rank_tensor") - with self.assertRaisesRegexp(ValueError, + with self.assertRaisesRegexp(TypeError, "must be of type "): with tf.control_dependencies([tf.assert_rank(tensor, rank_tensor)]): tf.identity(tensor).eval(feed_dict={rank_tensor: .5}) @@ -555,10 +562,8 @@ class AssertIntegerTest(tf.test.TestCase): def test_raises_when_float(self): with self.test_session(): floats = tf.constant([1.0, 2.0], name="floats") - with tf.control_dependencies([tf.assert_integer(floats)]): - out = tf.identity(floats) - with self.assertRaisesOpError("x is not of integer dtype.*"): - out.eval() + with self.assertRaisesRegexp(TypeError, "Expected.*integer"): + tf.assert_integer(floats) class IsStrictlyIncreasingTest(tf.test.TestCase): diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index a8ae70e49b0..159305f78bb 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -716,10 +716,20 @@ class ControlFlowTest(tf.test.TestCase): def 
testWhileWithControl_3(self): with self.test_session() as sess: b = tf.placeholder(tf.bool) - c = tf.constant(0) + c = tf.constant(1) + x0 = tf.constant(0) with tf.control_dependencies([b]): - c = tf.while_loop(lambda x: x < 10, lambda x: x + 1, [c]) - self.assertEqual(10, sess.run(c, {b: True})) + r = tf.while_loop(lambda x: x < 10, lambda x: x + c, [x0]) + self.assertEqual(10, sess.run(r, {b: True})) + + def testWhileWithControl_4(self): + with self.test_session() as sess: + b = tf.placeholder(tf.bool) + c = tf.constant(1) + x0 = tf.constant(0) + with tf.control_dependencies([b]): + r = tf.while_loop(lambda x: x < 10, lambda x: x + tf.identity(c), [x0]) + self.assertEqual(10, sess.run(r, {b: True})) def testCondWhile_1(self): with self.test_session(): @@ -1236,6 +1246,27 @@ class ControlFlowTest(tf.test.TestCase): r = tf.gradients([rx], x) self.assertAllClose(64.0, r[0].eval()) + def testWhileGrad_OneOutputWithControlDependencyOnSecond(self): + with self.test_session(): + i = tf.constant(0, name="i") + x = tf.constant(1.0, name="x") + y = tf.constant(1.0, name="y") + c = lambda i, *_: tf.less(i, 1, name="cond_less") + def b(i, xi, yi): + # return (i + 1, xi, xi + yi) + return (tf.add(i, 1, name="inc"), + tf.identity(xi, name="xi"), + tf.add(xi, yi, name="xi_plus_yi")) + + _, x_f, y_f = tf.while_loop(c, b, [i, x, y]) + with tf.control_dependencies([x_f]): + y_f_d = tf.identity(y_f, name="y_f_d") + + self.assertAllClose(2.0, y_f_d.eval()) # y_f_d = 1.0 + 1.0 + g = tf.gradients([y_f_d], [x])[0] + self.assertTrue(g is not None) + self.assertAllClose(1.0, g.eval()) # y_f_d = x + 1.0, dy_f_d/dx = 1.0 + def _testNestedWhileGrad_Simple(self, use_gpu): with self.test_session(use_gpu=use_gpu): v = tf.constant(1.0) diff --git a/tensorflow/python/kernel_tests/depthtospace_op_test.py b/tensorflow/python/kernel_tests/depthtospace_op_test.py index cd603932838..3d6ae377fe1 100644 --- a/tensorflow/python/kernel_tests/depthtospace_op_test.py +++ 
b/tensorflow/python/kernel_tests/depthtospace_op_test.py @@ -26,10 +26,9 @@ import tensorflow as tf class DepthToSpaceTest(tf.test.TestCase): def _testOne(self, inputs, block_size, outputs): - for use_gpu in [False, True]: - with self.test_session(use_gpu=use_gpu): - x_tf = tf.depth_to_space(tf.to_float(inputs), block_size) - self.assertAllEqual(x_tf.eval(), outputs) + with self.test_session(): + x_tf = tf.depth_to_space(tf.to_float(inputs), block_size) + self.assertAllEqual(x_tf.eval(), outputs) def testBasic(self): x_np = [[[[1, 2, 3, 4]]]] diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py index ae521c05f0a..bdc83ea6328 100644 --- a/tensorflow/python/kernel_tests/diag_op_test.py +++ b/tensorflow/python/kernel_tests/diag_op_test.py @@ -319,21 +319,21 @@ class DiagTest(tf.test.TestCase): [[5.5 + 5.5j, 6.6 + 6.6j], [7.7 + 7.7j, 8.8 + 8.8j]]], dtype = np.complex64) expected_ans = np.array( - [[[[[[1.1 + 1.1j, 0 + 0j], [0 + 0j, 0 + 0j]], + [[[[[[1.1 + 1.1j, 0 + 0j], [0 + 0j, 0 + 0j]], [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]], - [[[0 + 0j, 2.2 + 2.2j], [0 + 0j, 0 + 0j]], + [[[0 + 0j, 2.2 + 2.2j], [0 + 0j, 0 + 0j]], [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]]], - [[[[0 + 0j, 0 + 0j], [3.3 + 3.3j, 0 + 0j]], + [[[[0 + 0j, 0 + 0j], [3.3 + 3.3j, 0 + 0j]], [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]], - [[[0 + 0j, 0 + 0j], [0 + 0j, 4.4 + 4.4j]], + [[[0 + 0j, 0 + 0j], [0 + 0j, 4.4 + 4.4j]], [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]]]], - [[[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], + [[[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], [[5.5 + 5.5j, 0 + 0j], [0 + 0j, 0 + 0j]]], - [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], + [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], [[0 + 0j, 6.6 + 6.6j], [0 + 0j, 0 + 0j]]]], - [[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], + [[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], [[0 + 0j, 0 + 0j], [7.7 + 7.7j, 0 + 0j]]], - [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], + [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], [[0 + 0j, 0 + 0j], [0 + 0j, 8.8 + 8.8j]]]]]], 
dtype = np.complex64) self.diagOp(x, np.complex64, expected_ans) diff --git a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py index 1edea3f1f25..54433420be6 100644 --- a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py +++ b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py @@ -40,16 +40,15 @@ class ExtractImagePatches(tf.test.TestCase): strides = [1] + strides + [1] rates = [1] + rates + [1] - for use_gpu in [False, True]: - with self.test_session(use_gpu=use_gpu): - out_tensor = tf.extract_image_patches( - tf.constant(image), - ksizes=ksizes, - strides=strides, - rates=rates, - padding=padding, - name="im2col") - self.assertAllClose(patches, out_tensor.eval()) + with self.test_session(): + out_tensor = tf.extract_image_patches( + tf.constant(image), + ksizes=ksizes, + strides=strides, + rates=rates, + padding=padding, + name="im2col") + self.assertAllClose(patches, out_tensor.eval()) def testKsize1x1Stride1x1Rate1x1(self): """Verifies that for 1x1 kernel the output equals the input.""" diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py index 403d86b8f4c..e73d61d2617 100644 --- a/tensorflow/python/kernel_tests/functional_ops_test.py +++ b/tensorflow/python/kernel_tests/functional_ops_test.py @@ -367,5 +367,22 @@ class FunctionalOpsTest(tf.test.TestCase): y = tf.scan(fn, x, initializer=initializer) self.assertIs(None, y.get_shape().dims) + def testScanVaryingShape(self): + with self.test_session() as sess: + x = tf.placeholder(dtype=tf.float32, shape=[None, 2]) + x_t = tf.transpose(x) + # scan over dimension 0 (with shape None) + result = tf.scan(lambda a, x: a + x, x) + # scanned over transposed dimension 0 (with shape 2) + result_t = tf.scan(lambda a, x: a + x, x_t, infer_shape=False) + # ensure gradients can be calculated + result_grad = tf.gradients(result, [x])[0] + result_t_grad 
= tf.gradients(result_t, [x_t])[0] + + # smoke test to ensure they all evaluate + sess.run([result, result_t, result_grad, result_t_grad], + feed_dict={x: [[1.0, 2.0]]}) + + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py index d23e4777e07..7d5323e5cb9 100644 --- a/tensorflow/python/kernel_tests/init_ops_test.py +++ b/tensorflow/python/kernel_tests/init_ops_test.py @@ -276,42 +276,55 @@ class RangeTest(tf.test.TestCase): # TODO(vrv): move to sequence_ops_test? class LinSpaceTest(tf.test.TestCase): + def _gpu_modes(self): + if tf.test.is_gpu_available(): + return [False, True] + else: + return [False] + def _LinSpace(self, start, stop, num): - with self.test_session(): - tf_ans = tf.linspace(start, stop, num, name="linspace") - self.assertEqual([num], tf_ans.get_shape()) - return tf_ans.eval() + # NOTE(touts): Needs to pass a graph to get a new session each time. + with tf.Graph().as_default() as graph: + with self.test_session(graph=graph, force_gpu=self.force_gpu): + tf_ans = tf.linspace(start, stop, num, name="linspace") + self.assertEqual([num], tf_ans.get_shape()) + return tf_ans.eval() def testPositive(self): - self.assertArrayNear(self._LinSpace(1., 5., 1), np.array([1.]), 1e-5) - self.assertArrayNear(self._LinSpace(1., 5., 2), np.array([1., 5.]), 1e-5) - self.assertArrayNear(self._LinSpace(1., 5., 3), - np.array([1., 3., 5.]), 1e-5) - self.assertArrayNear(self._LinSpace(1., 5., 4), - np.array([1., 7. / 3., 11. / 3., 5.]), 1e-5) + for self.force_gpu in self._gpu_modes(): + self.assertArrayNear(self._LinSpace(1., 5., 1), np.array([1.]), 1e-5) + self.assertArrayNear(self._LinSpace(1., 5., 2), np.array([1., 5.]), 1e-5) + self.assertArrayNear(self._LinSpace(1., 5., 3), + np.array([1., 3., 5.]), 1e-5) + self.assertArrayNear(self._LinSpace(1., 5., 4), + np.array([1., 7. / 3., 11. 
/ 3., 5.]), 1e-5) def testNegative(self): - self.assertArrayNear(self._LinSpace(-1., -5., 1), np.array([-1.]), 1e-5) - self.assertArrayNear(self._LinSpace(-1., -5., 2), - np.array([-1., -5.]), 1e-5) - self.assertArrayNear(self._LinSpace(-1., -5., 3), - np.array([-1., -3., -5.]), 1e-5) - self.assertArrayNear(self._LinSpace(-1., -5., 4), - np.array([-1., -7. / 3., -11. / 3., -5.]), 1e-5) + for self.force_gpu in self._gpu_modes(): + self.assertArrayNear(self._LinSpace(-1., -5., 1), np.array([-1.]), 1e-5) + self.assertArrayNear(self._LinSpace(-1., -5., 2), + np.array([-1., -5.]), 1e-5) + self.assertArrayNear(self._LinSpace(-1., -5., 3), + np.array([-1., -3., -5.]), 1e-5) + self.assertArrayNear(self._LinSpace(-1., -5., 4), + np.array([-1., -7. / 3., -11. / 3., -5.]), 1e-5) def testNegativeToPositive(self): - self.assertArrayNear(self._LinSpace(-1., 5., 1), np.array([-1.]), 1e-5) - self.assertArrayNear(self._LinSpace(-1., 5., 2), np.array([-1., 5.]), 1e-5) - self.assertArrayNear(self._LinSpace(-1., 5., 3), - np.array([-1., 2., 5.]), 1e-5) - self.assertArrayNear(self._LinSpace(-1., 5., 4), - np.array([-1., 1., 3., 5.]), 1e-5) + for self.force_gpu in self._gpu_modes(): + self.assertArrayNear(self._LinSpace(-1., 5., 1), np.array([-1.]), 1e-5) + self.assertArrayNear(self._LinSpace(-1., 5., 2), np.array([-1., 5.]), + 1e-5) + self.assertArrayNear(self._LinSpace(-1., 5., 3), + np.array([-1., 2., 5.]), 1e-5) + self.assertArrayNear(self._LinSpace(-1., 5., 4), + np.array([-1., 1., 3., 5.]), 1e-5) def testPoint(self): - self.assertArrayNear(self._LinSpace(5., 5., 1), np.array([5.]), 1e-5) - self.assertArrayNear(self._LinSpace(5., 5., 2), np.array([5.] * 2), 1e-5) - self.assertArrayNear(self._LinSpace(5., 5., 3), np.array([5.] * 3), 1e-5) - self.assertArrayNear(self._LinSpace(5., 5., 4), np.array([5.] 
* 4), 1e-5) + for self.force_gpu in self._gpu_modes(): + self.assertArrayNear(self._LinSpace(5., 5., 1), np.array([5.]), 1e-5) + self.assertArrayNear(self._LinSpace(5., 5., 2), np.array([5.] * 2), 1e-5) + self.assertArrayNear(self._LinSpace(5., 5., 3), np.array([5.] * 3), 1e-5) + self.assertArrayNear(self._LinSpace(5., 5., 4), np.array([5.] * 4), 1e-5) class DeviceTest(tf.test.TestCase): diff --git a/tensorflow/python/kernel_tests/one_hot_op_test.py b/tensorflow/python/kernel_tests/one_hot_op_test.py index 913b5190a8b..9a9dbfe8c92 100644 --- a/tensorflow/python/kernel_tests/one_hot_op_test.py +++ b/tensorflow/python/kernel_tests/one_hot_op_test.py @@ -233,27 +233,54 @@ class OneHotTest(tf.test.TestCase): dtype=dtype, truth=[truth[0].T, truth[1].T]) # Do not transpose the batch + def _testEmpty(self, dtype): + indices = np.zeros((0, 16), dtype=np.int64) + depth = 3 + on_value = np.asarray(1.0, dtype=dtype) + off_value = np.asarray(-1.0, dtype=dtype) + truth = np.empty((0, 16, 3), dtype=dtype) + + # axis == -1 + self._testBothOneHot( + indices=indices, + depth=depth, + on_value=on_value, + off_value=off_value, + dtype=dtype, + truth=truth) + + def testHalfBatch(self): + self._testEmpty(np.float16) + self._testBatch(np.float16) + self._testDefaultValuesBatch(np.float16) + self._testValueTypeBatch(np.float16) + def testFloatBatch(self): + self._testEmpty(np.float32) self._testBatch(np.float32) self._testDefaultValuesBatch(np.float32) self._testValueTypeBatch(np.float32) def testDoubleBatch(self): + self._testEmpty(np.float64) self._testBatch(np.float64) self._testDefaultValuesBatch(np.float64) self._testValueTypeBatch(np.float64) def testInt32Batch(self): + self._testEmpty(np.int32) self._testBatch(np.int32) self._testDefaultValuesBatch(np.int32) self._testValueTypeBatch(np.int32) def testInt64Batch(self): + self._testEmpty(np.int64) self._testBatch(np.int64) self._testDefaultValuesBatch(np.int64) self._testValueTypeBatch(np.int64) def testComplexBatch(self): + 
self._testEmpty(np.complex64) self._testBatch(np.complex64) # self._testDefaultValuesBatch(np.complex64) self._testValueTypeBatch(np.complex64) diff --git a/tensorflow/python/kernel_tests/pad_op_test.py b/tensorflow/python/kernel_tests/pad_op_test.py index b88ba668f82..5b94583a8aa 100644 --- a/tensorflow/python/kernel_tests/pad_op_test.py +++ b/tensorflow/python/kernel_tests/pad_op_test.py @@ -61,9 +61,9 @@ class PadOpTest(tf.test.TestCase): [[1, 1], [1, 2]], mode="symmetric")) - def _testPad(self, np_inputs, paddings, mode, use_gpu=False): + def _testPad(self, np_inputs, paddings, mode): np_val = self._npPad(np_inputs, paddings, mode=mode) - with self.test_session(use_gpu=use_gpu): + with self.test_session(): tf_val = tf.pad(np_inputs, paddings, mode=mode) out = tf_val.eval() self.assertAllEqual(np_val, out) @@ -86,8 +86,8 @@ class PadOpTest(tf.test.TestCase): def _testAll(self, np_inputs, paddings): for mode in ("CONSTANT", "REFLECT", "SYMMETRIC"): - self._testPad(np_inputs, paddings, mode=mode, use_gpu=False) - self._testPad(np_inputs, paddings, mode=mode, use_gpu=True) + self._testPad(np_inputs, paddings, mode=mode) + self._testPad(np_inputs, paddings, mode=mode) if np_inputs.dtype == np.float32: self._testGradient(np_inputs, paddings, mode=mode) @@ -189,12 +189,11 @@ class PadOpTest(tf.test.TestCase): def testScalars(self): paddings = np.zeros((0, 2), dtype=np.int32) inp = np.asarray(7) - for use_gpu in False, True: - with self.test_session(use_gpu=use_gpu): - tf_val = tf.pad(inp, paddings) - out = tf_val.eval() - self.assertAllEqual(inp, out) - self.assertShapeEqual(inp, tf_val) + with self.test_session(): + tf_val = tf.pad(inp, paddings) + out = tf_val.eval() + self.assertAllEqual(inp, out) + self.assertShapeEqual(inp, tf_val) if __name__ == "__main__": diff --git a/tensorflow/python/kernel_tests/parsing_ops_test.py b/tensorflow/python/kernel_tests/parsing_ops_test.py index 58f6da9f976..52d3c0dde1a 100644 --- 
a/tensorflow/python/kernel_tests/parsing_ops_test.py +++ b/tensorflow/python/kernel_tests/parsing_ops_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Tests for tensorflow.ops.parsing_ops.""" from __future__ import absolute_import @@ -46,13 +45,13 @@ def flatten(list_of_lists): def flatten_values_tensors_or_sparse(tensors_list): """Flatten each SparseTensor object into 3 Tensors for session.run().""" - return list(flatten([[v.indices, v.values, v.shape] - if isinstance(v, tf.SparseTensor) else [v] - for v in tensors_list])) + return list( + flatten([[v.indices, v.values, v.shape] if isinstance(v, tf.SparseTensor) + else [v] for v in tensors_list])) -def _compare_output_to_expected( - tester, dict_tensors, expected_tensors, flat_output): +def _compare_output_to_expected(tester, dict_tensors, expected_tensors, + flat_output): tester.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys())) i = 0 # Index into the flattened output of session.run() @@ -74,11 +73,11 @@ def _compare_output_to_expected( class ParseExampleTest(tf.test.TestCase): - def _test( - self, kwargs, expected_values=None, expected_err=None): + def _test(self, kwargs, expected_values=None, expected_err=None): with self.test_session() as sess: if expected_err: - with self.assertRaisesRegexp(expected_err[0], expected_err[1]): + with self.assertRaisesWithPredicateMatch( + expected_err[0], expected_err[1]): out = tf.parse_example(**kwargs) sess.run(flatten_values_tensors_or_sparse(out.values())) else: @@ -92,9 +91,8 @@ class ParseExampleTest(tf.test.TestCase): # Check shapes; if serialized is a Tensor we need its size to # properly check. 
serialized = kwargs["serialized"] - batch_size = ( - serialized.eval().size if isinstance(serialized, tf.Tensor) - else np.asarray(serialized).size) + batch_size = (serialized.eval().size if isinstance(serialized, tf.Tensor) + else np.asarray(serialized).size) for k, f in kwargs["features"].items(): if isinstance(f, tf.FixedLenFeature) and f.shape is not None: self.assertEqual( @@ -115,9 +113,12 @@ class ParseExampleTest(tf.test.TestCase): c_default = np.random.rand(2).astype(np.float32) expected_st_a = ( # indices, values, shape - np.empty((0, 2), dtype=np.int64), # indices - np.empty((0,), dtype=np.int64), # sp_a is DT_INT64 - np.array([2, 0], dtype=np.int64)) # batch == 2, max_elems = 0 + np.empty( + (0, 2), dtype=np.int64), # indices + np.empty( + (0,), dtype=np.int64), # sp_a is DT_INT64 + np.array( + [2, 0], dtype=np.int64)) # batch == 2, max_elems = 0 expected_output = { sparse_name: expected_st_a, @@ -126,38 +127,63 @@ class ParseExampleTest(tf.test.TestCase): c_name: np.array(2 * [c_default]), } - self._test({ - "example_names": np.empty((0,), dtype=bytes), - "serialized": tf.convert_to_tensor(["", ""]), - "features": { - sparse_name: tf.VarLenFeature(tf.int64), - a_name: tf.FixedLenFeature((1, 3), tf.int64, default_value=a_default), - b_name: tf.FixedLenFeature((3, 3), tf.string, default_value=b_default), - c_name: tf.FixedLenFeature((2,), tf.float32, default_value=c_default), - } - }, expected_output) + self._test( + { + "example_names": np.empty( + (0,), dtype=bytes), + "serialized": tf.convert_to_tensor(["", ""]), + "features": { + sparse_name: tf.VarLenFeature(tf.int64), + a_name: tf.FixedLenFeature( + (1, 3), tf.int64, default_value=a_default), + b_name: tf.FixedLenFeature( + (3, 3), tf.string, default_value=b_default), + c_name: tf.FixedLenFeature( + (2,), tf.float32, default_value=c_default), + } + }, + expected_output) def testEmptySerializedWithoutDefaultsShouldFail(self): - self._test({ - "example_names": ["in1", "in2"], - "serialized": ["", 
""], - "features": { - "st_a": tf.VarLenFeature(tf.int64), - "a": tf.FixedLenFeature((1, 3), tf.int64, default_value=[0, 42, 0]), - "b": tf.FixedLenFeature( - (3, 3), tf.string, - default_value=np.random.rand(3, 3).astype(bytes)), - # Feature "c" is missing a default, this gap will cause failure. - "c": tf.FixedLenFeature((2,), dtype=tf.float32), - } - }, expected_err=(tf.OpError, "Name: in1, Feature: c is required")) + input_features = { + "st_a": tf.VarLenFeature(tf.int64), + "a": tf.FixedLenFeature( + (1, 3), tf.int64, default_value=[0, 42, 0]), + "b": tf.FixedLenFeature( + (3, 3), + tf.string, + default_value=np.random.rand(3, 3).astype(bytes)), + # Feature "c" is missing a default, this gap will cause failure. + "c": tf.FixedLenFeature( + (2,), dtype=tf.float32), + } + + # Edge case where the key is there but the feature value is empty + original = example(features=features({ + "c": feature() + })) + self._test( + { + "example_names": ["in1"], + "serialized": [original.SerializeToString()], + "features": input_features, + }, + expected_err=(tf.OpError, "Name: in1, Feature: c is required")) + + # Standard case of missing key and value. + self._test( + { + "example_names": ["in1", "in2"], + "serialized": ["", ""], + "features": input_features, + }, + expected_err=(tf.OpError, "Name: in1, Feature: c is required")) def testDenseNotMatchingShapeShouldFail(self): original = [ example(features=features({ "a": float_feature([1, 1, 3]), - })), - example(features=features({ + })), example(features=features({ "a": float_feature([-1, -1]), })) ] @@ -165,27 +191,27 @@ class ParseExampleTest(tf.test.TestCase): names = ["passing", "failing"] serialized = [m.SerializeToString() for m in original] - self._test({ - "example_names": names, - "serialized": tf.convert_to_tensor(serialized), - "features": {"a": tf.FixedLenFeature((1, 3), tf.float32)} - }, expected_err=( - tf.OpError, "Name: failing, Key: a, Index: 1. 
Number of float val")) + self._test( + { + "example_names": names, + "serialized": tf.convert_to_tensor(serialized), + "features": {"a": tf.FixedLenFeature((1, 3), tf.float32)} + }, + expected_err=(tf.OpError, + "Name: failing, Key: a, Index: 1. Number of float val")) def testDenseDefaultNoShapeShouldFail(self): - original = [ - example(features=features({ - "a": float_feature([1, 1, 3]), - })), - ] + original = [example(features=features({"a": float_feature([1, 1, 3]),})),] serialized = [m.SerializeToString() for m in original] - self._test({ - "example_names": ["failing"], - "serialized": tf.convert_to_tensor(serialized), - "features": {"a": tf.FixedLenFeature(None, tf.float32)} - }, expected_err=(ValueError, "Missing shape for feature a")) + self._test( + { + "example_names": ["failing"], + "serialized": tf.convert_to_tensor(serialized), + "features": {"a": tf.FixedLenFeature(None, tf.float32)} + }, + expected_err=(ValueError, "Missing shape for feature a")) def testSerializedContainingSparse(self): original = [ @@ -207,14 +233,16 @@ class ParseExampleTest(tf.test.TestCase): serialized = [m.SerializeToString() for m in original] expected_st_c = ( # indices, values, shape - np.array([[0, 0], [0, 1], [3, 0], [3, 1], [3, 2]], dtype=np.int64), - np.array([3.0, 4.0, 1.0, 2.0, -1.0], dtype=np.float32), - np.array([4, 3], dtype=np.int64)) # batch == 2, max_elems = 3 + np.array( + [[0, 0], [0, 1], [3, 0], [3, 1], [3, 2]], dtype=np.int64), np.array( + [3.0, 4.0, 1.0, 2.0, -1.0], dtype=np.float32), np.array( + [4, 3], dtype=np.int64)) # batch == 2, max_elems = 3 expected_st_d = ( # indices, values, shape - np.array([[3, 0]], dtype=np.int64), - np.array(["hi"], dtype=bytes), - np.array([4, 1], dtype=np.int64)) # batch == 2, max_elems = 1 + np.array( + [[3, 0]], dtype=np.int64), np.array( + ["hi"], dtype=bytes), np.array( + [4, 1], dtype=np.int64)) # batch == 2, max_elems = 1 expected_output = { "st_c": expected_st_c, @@ -236,8 +264,7 @@ class 
ParseExampleTest(tf.test.TestCase): example(features=features({ aname: float_feature([1, 1]), bname: bytes_feature([b"b0_str"]), - })), - example(features=features({ + })), example(features=features({ aname: float_feature([-1, -1]), bname: bytes_feature([b"b1"]), })) @@ -248,24 +275,28 @@ class ParseExampleTest(tf.test.TestCase): expected_output = { aname: np.array( [[1, 1], [-1, -1]], dtype=np.float32).reshape(2, 1, 2, 1), - bname: np.array(["b0_str", "b1"], dtype=bytes).reshape(2, 1, 1, 1, 1), + bname: np.array( + ["b0_str", "b1"], dtype=bytes).reshape(2, 1, 1, 1, 1), } # No defaults, values required - self._test({ - "serialized": tf.convert_to_tensor(serialized), - "features": { - aname: tf.FixedLenFeature((1, 2, 1), dtype=tf.float32), - bname: tf.FixedLenFeature((1, 1, 1, 1), dtype=tf.string), - } - }, expected_output) + self._test( + { + "serialized": tf.convert_to_tensor(serialized), + "features": { + aname: tf.FixedLenFeature( + (1, 2, 1), dtype=tf.float32), + bname: tf.FixedLenFeature( + (1, 1, 1, 1), dtype=tf.string), + } + }, + expected_output) def testSerializedContainingDenseScalar(self): original = [ example(features=features({ "a": float_feature([1]), - })), - example(features=features({})) + })), example(features=features({})) ] serialized = [m.SerializeToString() for m in original] @@ -274,12 +305,15 @@ class ParseExampleTest(tf.test.TestCase): "a": np.array([[1], [-1]], dtype=np.float32) # 2x1 (column vector) } - self._test({ - "serialized": tf.convert_to_tensor(serialized), - "features": { - "a": tf.FixedLenFeature((1,), dtype=tf.float32, default_value=-1), - } - }, expected_output) + self._test( + { + "serialized": tf.convert_to_tensor(serialized), + "features": { + "a": tf.FixedLenFeature( + (1,), dtype=tf.float32, default_value=-1), + } + }, + expected_output) def testSerializedContainingDenseWithDefaults(self): original = [ @@ -288,37 +322,46 @@ class ParseExampleTest(tf.test.TestCase): })), example(features=features({ "b": 
bytes_feature([b"b1"]), - })) + })), + example(features=features({ + "b": feature() + })), ] serialized = [m.SerializeToString() for m in original] expected_output = { - "a": np.array([[1, 1], [3, -3]], dtype=np.float32).reshape(2, 1, 2, 1), - "b": np.array(["tmp_str", "b1"], dtype=bytes).reshape(2, 1, 1, 1, 1), + "a": np.array( + [[1, 1], [3, -3], [3, -3]], dtype=np.float32).reshape(3, 1, 2, 1), + "b": np.array( + ["tmp_str", "b1", "tmp_str"], dtype=bytes).reshape(3, 1, 1, 1, 1), } - self._test({ - "serialized": tf.convert_to_tensor(serialized), - "features": { - "a": tf.FixedLenFeature( - (1, 2, 1), dtype=tf.float32, default_value=[3.0, -3.0]), - "b": tf.FixedLenFeature( - (1, 1, 1, 1), dtype=tf.string, default_value="tmp_str"), - } - }, expected_output) + self._test( + { + "serialized": tf.convert_to_tensor(serialized), + "features": { + "a": tf.FixedLenFeature( + (1, 2, 1), dtype=tf.float32, default_value=[3.0, -3.0]), + "b": tf.FixedLenFeature( + (1, 1, 1, 1), dtype=tf.string, default_value="tmp_str"), + } + }, + expected_output) def testSerializedContainingSparseAndDenseWithNoDefault(self): expected_st_a = ( # indices, values, shape - np.empty((0, 2), dtype=np.int64), # indices - np.empty((0,), dtype=np.int64), # sp_a is DT_INT64 - np.array([2, 0], dtype=np.int64)) # batch == 2, max_elems = 0 + np.empty( + (0, 2), dtype=np.int64), # indices + np.empty( + (0,), dtype=np.int64), # sp_a is DT_INT64 + np.array( + [2, 0], dtype=np.int64)) # batch == 2, max_elems = 0 original = [ example(features=features({ "c": float_feature([3, 4]) - })), - example(features=features({ + })), example(features=features({ "c": float_feature([1, 2]) })) ] @@ -332,20 +375,25 @@ class ParseExampleTest(tf.test.TestCase): "st_a": expected_st_a, "a": np.array(2 * [[a_default]]), "b": np.array(2 * [b_default]), - "c": np.array([[3, 4], [1, 2]], dtype=np.float32), + "c": np.array( + [[3, 4], [1, 2]], dtype=np.float32), } - self._test({ - "example_names": names, - "serialized": 
tf.convert_to_tensor(serialized), - "features": { - "st_a": tf.VarLenFeature(tf.int64), - "a": tf.FixedLenFeature((1, 3), tf.int64, default_value=a_default), - "b": tf.FixedLenFeature((3, 3), tf.string, default_value=b_default), - # Feature "c" must be provided, since it has no default_value. - "c": tf.FixedLenFeature((2,), tf.float32), - } - }, expected_output) + self._test( + { + "example_names": names, + "serialized": tf.convert_to_tensor(serialized), + "features": { + "st_a": tf.VarLenFeature(tf.int64), + "a": tf.FixedLenFeature( + (1, 3), tf.int64, default_value=a_default), + "b": tf.FixedLenFeature( + (3, 3), tf.string, default_value=b_default), + # Feature "c" must be provided, since it has no default_value. + "c": tf.FixedLenFeature((2,), tf.float32), + } + }, + expected_output) class ParseSingleExampleTest(tf.test.TestCase): @@ -353,7 +401,8 @@ class ParseSingleExampleTest(tf.test.TestCase): def _test(self, kwargs, expected_values=None, expected_err=None): with self.test_session() as sess: if expected_err: - with self.assertRaisesRegexp(expected_err[0], expected_err[1]): + with self.assertRaisesWithPredicateMatch( + expected_err[0], expected_err[1]): out = tf.parse_single_example(**kwargs) sess.run(flatten_values_tensors_or_sparse(out.values())) else: @@ -374,16 +423,17 @@ class ParseSingleExampleTest(tf.test.TestCase): self.assertEqual(tuple(out[k].shape.get_shape().as_list()), (1,)) def testSingleExampleWithSparseAndDense(self): - original = example(features=features( - {"c": float_feature([3, 4]), - "st_a": float_feature([3.0, 4.0])})) + original = example(features=features({"c": float_feature([3, 4]), + "st_a": float_feature([3.0, 4.0])})) serialized = original.SerializeToString() - expected_st_a = ( - np.array([[0], [1]], dtype=np.int64), # indices - np.array([3.0, 4.0], dtype=np.float32), # values - np.array([2], dtype=np.int64)) # shape: max_values = 2 + expected_st_a = (np.array( + [[0], [1]], dtype=np.int64), # indices + np.array( + [3.0, 4.0], 
dtype=np.float32), # values + np.array( + [2], dtype=np.int64)) # shape: max_values = 2 a_default = [1, 2, 3] b_default = np.random.rand(3, 3).astype(bytes) @@ -391,20 +441,25 @@ class ParseSingleExampleTest(tf.test.TestCase): "st_a": expected_st_a, "a": [a_default], "b": b_default, - "c": np.array([3, 4], dtype=np.float32), + "c": np.array( + [3, 4], dtype=np.float32), } - self._test({ - "example_names": tf.convert_to_tensor("in1"), - "serialized": tf.convert_to_tensor(serialized), - "features": { - "st_a": tf.VarLenFeature(tf.float32), - "a": tf.FixedLenFeature((1, 3), tf.int64, default_value=a_default), - "b": tf.FixedLenFeature((3, 3), tf.string, default_value=b_default), - # Feature "c" must be provided, since it has no default_value. - "c": tf.FixedLenFeature((2,), tf.float32), - } - }, expected_output) + self._test( + { + "example_names": tf.convert_to_tensor("in1"), + "serialized": tf.convert_to_tensor(serialized), + "features": { + "st_a": tf.VarLenFeature(tf.float32), + "a": tf.FixedLenFeature( + (1, 3), tf.int64, default_value=a_default), + "b": tf.FixedLenFeature( + (3, 3), tf.string, default_value=b_default), + # Feature "c" must be provided, since it has no default_value. 
+ "c": tf.FixedLenFeature((2,), tf.float32), + } + }, + expected_output) class ParseSequenceExampleTest(tf.test.TestCase): @@ -413,26 +468,31 @@ class ParseSequenceExampleTest(tf.test.TestCase): value = sequence_example( context=features({ "global_feature": float_feature([1, 2, 3]), - }), + }), feature_lists=feature_lists({ "repeated_feature_2_frames": feature_list([ bytes_feature([b"a", b"b", b"c"]), - bytes_feature([b"a", b"d", b"e"])]), + bytes_feature([b"a", b"d", b"e"]) + ]), "repeated_feature_3_frames": feature_list([ - int64_feature([3, 4, 5, 6, 7]), - int64_feature([-1, 0, 0, 0, 0]), - int64_feature([1, 2, 3, 4, 5])]) - })) + int64_feature([3, 4, 5, 6, 7]), int64_feature([-1, 0, 0, 0, 0]), + int64_feature([1, 2, 3, 4, 5]) + ]) + })) value.SerializeToString() # Smoke test - def _test(self, kwargs, expected_context_values=None, - expected_feat_list_values=None, expected_err=None): + def _test(self, + kwargs, + expected_context_values=None, + expected_feat_list_values=None, + expected_err=None): expected_context_values = expected_context_values or {} expected_feat_list_values = expected_feat_list_values or {} with self.test_session() as sess: if expected_err: - with self.assertRaisesRegexp(expected_err[0], expected_err[1]): + with self.assertRaisesWithPredicateMatch( + expected_err[0], expected_err[1]): c_out, fl_out = tf.parse_single_sequence_example(**kwargs) if c_out: sess.run(flatten_values_tensors_or_sparse(c_out.values())) @@ -442,16 +502,16 @@ class ParseSequenceExampleTest(tf.test.TestCase): # Returns dicts w/ Tensors and SparseTensors. 
context_out, feat_list_out = tf.parse_single_sequence_example(**kwargs) context_result = sess.run( - flatten_values_tensors_or_sparse( - context_out.values())) if context_out else [] + flatten_values_tensors_or_sparse(context_out.values( + ))) if context_out else [] feat_list_result = sess.run( - flatten_values_tensors_or_sparse( - feat_list_out.values())) if feat_list_out else [] + flatten_values_tensors_or_sparse(feat_list_out.values( + ))) if feat_list_out else [] # Check values. - _compare_output_to_expected( - self, context_out, expected_context_values, context_result) - _compare_output_to_expected( - self, feat_list_out, expected_feat_list_values, feat_list_result) + _compare_output_to_expected(self, context_out, expected_context_values, + context_result) + _compare_output_to_expected(self, feat_list_out, + expected_feat_list_values, feat_list_result) # Check shapes; if serialized is a Tensor we need its size to # properly check. @@ -469,16 +529,18 @@ class ParseSequenceExampleTest(tf.test.TestCase): tuple(context_out[k].shape.get_shape().as_list()), (1,)) def testSequenceExampleWithSparseAndDenseContext(self): - original = sequence_example(context=features( - {"c": float_feature([3, 4]), - "st_a": float_feature([3.0, 4.0])})) + original = sequence_example(context=features({"c": float_feature([3, 4]), + "st_a": float_feature( + [3.0, 4.0])})) serialized = original.SerializeToString() - expected_st_a = ( - np.array([[0], [1]], dtype=np.int64), # indices - np.array([3.0, 4.0], dtype=np.float32), # values - np.array([2], dtype=np.int64)) # shape: num_features = 2 + expected_st_a = (np.array( + [[0], [1]], dtype=np.int64), # indices + np.array( + [3.0, 4.0], dtype=np.float32), # values + np.array( + [2], dtype=np.int64)) # shape: num_features = 2 a_default = [1, 2, 3] b_default = np.random.rand(3, 3).astype(bytes) @@ -486,20 +548,25 @@ class ParseSequenceExampleTest(tf.test.TestCase): "st_a": expected_st_a, "a": [a_default], "b": b_default, - "c": np.array([3, 
4], dtype=np.float32), + "c": np.array( + [3, 4], dtype=np.float32), } - self._test({ - "example_name": "in1", - "serialized": tf.convert_to_tensor(serialized), - "context_features": { - "st_a": tf.VarLenFeature(tf.float32), - "a": tf.FixedLenFeature((1, 3), tf.int64, default_value=a_default), - "b": tf.FixedLenFeature((3, 3), tf.string, default_value=b_default), - # Feature "c" must be provided, since it has no default_value. - "c": tf.FixedLenFeature((2,), tf.float32), - } - }, expected_context_values=expected_context_output) + self._test( + { + "example_name": "in1", + "serialized": tf.convert_to_tensor(serialized), + "context_features": { + "st_a": tf.VarLenFeature(tf.float32), + "a": tf.FixedLenFeature( + (1, 3), tf.int64, default_value=a_default), + "b": tf.FixedLenFeature( + (3, 3), tf.string, default_value=b_default), + # Feature "c" must be provided, since it has no default_value. + "c": tf.FixedLenFeature((2,), tf.float32), + } + }, + expected_context_values=expected_context_output) def testSequenceExampleWithMultipleSizeFeatureLists(self): original = sequence_example(feature_lists=feature_lists({ @@ -507,229 +574,274 @@ class ParseSequenceExampleTest(tf.test.TestCase): int64_feature([-1, 0, 1]), int64_feature([2, 3, 4]), int64_feature([5, 6, 7]), - int64_feature([8, 9, 10]),]), + int64_feature([8, 9, 10]), + ]), "b": feature_list([ - bytes_feature([b"r00", b"r01", b"r10", b"r11"])]), + bytes_feature([b"r00", b"r01", b"r10", b"r11"]) + ]), "c": feature_list([ - float_feature([3, 4]), - float_feature([-1, 2])]), - })) + float_feature([3, 4]), float_feature([-1, 2]) + ]), + })) serialized = original.SerializeToString() expected_feature_list_output = { - "a": np.array([ # outer dimension is time. 
- [[-1, 0, 1]], # inside are 1x3 matrices - [[2, 3, 4]], - [[5, 6, 7]], - [[8, 9, 10]]], dtype=np.int64), - "b": np.array([ # outer dimension is time, inside are 2x2 matrices - [[b"r00", b"r01"], [b"r10", b"r11"]]], dtype=bytes), - "c": np.array([ # outer dimension is time, inside are 2-vectors - [3, 4], - [-1, 2]], dtype=np.float32), - "d": np.empty(shape=(0, 5), dtype=np.float32), # empty_allowed_missing - } + "a": np.array( + [ # outer dimension is time. + [[-1, 0, 1]], # inside are 1x3 matrices + [[2, 3, 4]], + [[5, 6, 7]], + [[8, 9, 10]] + ], + dtype=np.int64), + "b": np.array( + [ # outer dimension is time, inside are 2x2 matrices + [[b"r00", b"r01"], [b"r10", b"r11"]] + ], + dtype=bytes), + "c": np.array( + [ # outer dimension is time, inside are 2-vectors + [3, 4], [-1, 2] + ], + dtype=np.float32), + "d": np.empty( + shape=(0, 5), dtype=np.float32), # empty_allowed_missing + } - self._test({ - "example_name": "in1", - "serialized": tf.convert_to_tensor(serialized), - "sequence_features": { - "a": tf.FixedLenSequenceFeature((1, 3), tf.int64), - "b": tf.FixedLenSequenceFeature((2, 2), tf.string), - "c": tf.FixedLenSequenceFeature((2,), tf.float32), - "d": tf.FixedLenSequenceFeature((5,), tf.float32, allow_missing=True), - } - }, expected_feat_list_values=expected_feature_list_output) + self._test( + { + "example_name": "in1", + "serialized": tf.convert_to_tensor(serialized), + "sequence_features": { + "a": tf.FixedLenSequenceFeature((1, 3), tf.int64), + "b": tf.FixedLenSequenceFeature((2, 2), tf.string), + "c": tf.FixedLenSequenceFeature((2,), tf.float32), + "d": tf.FixedLenSequenceFeature( + (5,), tf.float32, allow_missing=True), + } + }, + expected_feat_list_values=expected_feature_list_output) def testSequenceExampleWithoutDebugName(self): original = sequence_example(feature_lists=feature_lists({ "a": feature_list([ - int64_feature([3, 4]), - int64_feature([1, 0])]), + int64_feature([3, 4]), int64_feature([1, 0]) + ]), "st_a": feature_list([ - 
float_feature([3.0, 4.0]), - float_feature([5.0]), - float_feature([])]), + float_feature([3.0, 4.0]), float_feature([5.0]), float_feature([]) + ]), "st_b": feature_list([ - bytes_feature([b"a"]), - bytes_feature([]), - bytes_feature([]), - bytes_feature([b"b", b"c"])])})) + bytes_feature([b"a"]), bytes_feature([]), bytes_feature([]), + bytes_feature([b"b", b"c"]) + ]) + })) serialized = original.SerializeToString() expected_st_a = ( - np.array([[0, 0], [0, 1], [1, 0]], dtype=np.int64), # indices - np.array([3.0, 4.0, 5.0], dtype=np.float32), # values - np.array([3, 2], dtype=np.int64)) # shape: num_time = 3, max_feat = 2 + np.array( + [[0, 0], [0, 1], [1, 0]], dtype=np.int64), # indices + np.array( + [3.0, 4.0, 5.0], dtype=np.float32), # values + np.array( + [3, 2], dtype=np.int64)) # shape: num_time = 3, max_feat = 2 expected_st_b = ( - np.array([[0, 0], [3, 0], [3, 1]], dtype=np.int64), # indices - np.array(["a", "b", "c"], dtype="|S"), # values - np.array([4, 2], dtype=np.int64)) # shape: num_time = 4, max_feat = 2 + np.array( + [[0, 0], [3, 0], [3, 1]], dtype=np.int64), # indices + np.array( + ["a", "b", "c"], dtype="|S"), # values + np.array( + [4, 2], dtype=np.int64)) # shape: num_time = 4, max_feat = 2 expected_st_c = ( - np.empty((0, 2), dtype=np.int64), # indices - np.empty((0,), dtype=np.int64), # values - np.array([0, 0], dtype=np.int64)) # shape: num_time = 0, max_feat = 0 + np.empty( + (0, 2), dtype=np.int64), # indices + np.empty( + (0,), dtype=np.int64), # values + np.array( + [0, 0], dtype=np.int64)) # shape: num_time = 0, max_feat = 0 expected_feature_list_output = { - "a": np.array([[3, 4], [1, 0]], dtype=np.int64), + "a": np.array( + [[3, 4], [1, 0]], dtype=np.int64), "st_a": expected_st_a, "st_b": expected_st_b, "st_c": expected_st_c, } - self._test({ - "serialized": tf.convert_to_tensor(serialized), - "sequence_features": { - "st_a": tf.VarLenFeature(tf.float32), - "st_b": tf.VarLenFeature(tf.string), - "st_c": tf.VarLenFeature(tf.int64), - 
"a": tf.FixedLenSequenceFeature((2,), tf.int64), - } - }, expected_feat_list_values=expected_feature_list_output) + self._test( + { + "serialized": tf.convert_to_tensor(serialized), + "sequence_features": { + "st_a": tf.VarLenFeature(tf.float32), + "st_b": tf.VarLenFeature(tf.string), + "st_c": tf.VarLenFeature(tf.int64), + "a": tf.FixedLenSequenceFeature((2,), tf.int64), + } + }, + expected_feat_list_values=expected_feature_list_output) def testSequenceExampleWithSparseAndDenseFeatureLists(self): original = sequence_example(feature_lists=feature_lists({ "a": feature_list([ - int64_feature([3, 4]), - int64_feature([1, 0])]), + int64_feature([3, 4]), int64_feature([1, 0]) + ]), "st_a": feature_list([ - float_feature([3.0, 4.0]), - float_feature([5.0]), - float_feature([])]), + float_feature([3.0, 4.0]), float_feature([5.0]), float_feature([]) + ]), "st_b": feature_list([ - bytes_feature([b"a"]), - bytes_feature([]), - bytes_feature([]), - bytes_feature([b"b", b"c"])])})) + bytes_feature([b"a"]), bytes_feature([]), bytes_feature([]), + bytes_feature([b"b", b"c"]) + ]) + })) serialized = original.SerializeToString() expected_st_a = ( - np.array([[0, 0], [0, 1], [1, 0]], dtype=np.int64), # indices - np.array([3.0, 4.0, 5.0], dtype=np.float32), # values - np.array([3, 2], dtype=np.int64)) # shape: num_time = 3, max_feat = 2 + np.array( + [[0, 0], [0, 1], [1, 0]], dtype=np.int64), # indices + np.array( + [3.0, 4.0, 5.0], dtype=np.float32), # values + np.array( + [3, 2], dtype=np.int64)) # shape: num_time = 3, max_feat = 2 expected_st_b = ( - np.array([[0, 0], [3, 0], [3, 1]], dtype=np.int64), # indices - np.array(["a", "b", "c"], dtype="|S"), # values - np.array([4, 2], dtype=np.int64)) # shape: num_time = 4, max_feat = 2 + np.array( + [[0, 0], [3, 0], [3, 1]], dtype=np.int64), # indices + np.array( + ["a", "b", "c"], dtype="|S"), # values + np.array( + [4, 2], dtype=np.int64)) # shape: num_time = 4, max_feat = 2 expected_st_c = ( - np.empty((0, 2), dtype=np.int64), # 
indices - np.empty((0,), dtype=np.int64), # values - np.array([0, 0], dtype=np.int64)) # shape: num_time = 0, max_feat = 0 + np.empty( + (0, 2), dtype=np.int64), # indices + np.empty( + (0,), dtype=np.int64), # values + np.array( + [0, 0], dtype=np.int64)) # shape: num_time = 0, max_feat = 0 expected_feature_list_output = { - "a": np.array([[3, 4], [1, 0]], dtype=np.int64), + "a": np.array( + [[3, 4], [1, 0]], dtype=np.int64), "st_a": expected_st_a, "st_b": expected_st_b, "st_c": expected_st_c, } - self._test({ - "example_name": "in1", - "serialized": tf.convert_to_tensor(serialized), - "sequence_features": { - "st_a": tf.VarLenFeature(tf.float32), - "st_b": tf.VarLenFeature(tf.string), - "st_c": tf.VarLenFeature(tf.int64), - "a": tf.FixedLenSequenceFeature((2,), tf.int64), - } - }, expected_feat_list_values=expected_feature_list_output) + self._test( + { + "example_name": "in1", + "serialized": tf.convert_to_tensor(serialized), + "sequence_features": { + "st_a": tf.VarLenFeature(tf.float32), + "st_b": tf.VarLenFeature(tf.string), + "st_c": tf.VarLenFeature(tf.int64), + "a": tf.FixedLenSequenceFeature((2,), tf.int64), + } + }, + expected_feat_list_values=expected_feature_list_output) def testSequenceExampleListWithInconsistentDataFails(self): original = sequence_example(feature_lists=feature_lists({ "a": feature_list([ - int64_feature([-1, 0]), - float_feature([2, 3])]) - })) + int64_feature([-1, 0]), float_feature([2, 3]) + ]) + })) serialized = original.SerializeToString() - self._test({ - "example_name": "in1", - "serialized": tf.convert_to_tensor(serialized), - "sequence_features": {"a": tf.FixedLenSequenceFeature((2,), tf.int64)} - }, expected_err=( - tf.OpError, - "Feature list: a, Index: 1." - " Data types don't match. 
Expected type: int64")) + self._test( + { + "example_name": "in1", + "serialized": tf.convert_to_tensor(serialized), + "sequence_features": {"a": tf.FixedLenSequenceFeature( + (2,), tf.int64)} + }, + expected_err=(tf.OpError, "Feature list: a, Index: 1." + " Data types don't match. Expected type: int64")) def testSequenceExampleListWithWrongDataTypeFails(self): original = sequence_example(feature_lists=feature_lists({ "a": feature_list([ - float_feature([2, 3])]) - })) + float_feature([2, 3]) + ]) + })) serialized = original.SerializeToString() - self._test({ - "example_name": "in1", - "serialized": tf.convert_to_tensor(serialized), - "sequence_features": {"a": tf.FixedLenSequenceFeature((2,), tf.int64)} - }, expected_err=( - tf.OpError, - "Feature list: a, Index: 0. Data types don't match." - " Expected type: int64")) + self._test( + { + "example_name": "in1", + "serialized": tf.convert_to_tensor(serialized), + "sequence_features": {"a": tf.FixedLenSequenceFeature( + (2,), tf.int64)} + }, + expected_err=(tf.OpError, + "Feature list: a, Index: 0. Data types don't match." + " Expected type: int64")) def testSequenceExampleListWithWrongSparseDataTypeFails(self): original = sequence_example(feature_lists=feature_lists({ "a": feature_list([ - int64_feature([3, 4]), - int64_feature([1, 2]), - float_feature([2.0, 3.0])]) - })) + int64_feature([3, 4]), int64_feature([1, 2]), + float_feature([2.0, 3.0]) + ]) + })) serialized = original.SerializeToString() - self._test({ - "example_name": "in1", - "serialized": tf.convert_to_tensor(serialized), - "sequence_features": {"a": tf.FixedLenSequenceFeature((2,), tf.int64)} - }, expected_err=( - tf.OpError, - "Name: in1, Feature list: a, Index: 2." - " Data types don't match. 
Expected type: int64" - " Feature is: float_list")) + self._test( + { + "example_name": "in1", + "serialized": tf.convert_to_tensor(serialized), + "sequence_features": {"a": tf.FixedLenSequenceFeature( + (2,), tf.int64)} + }, + expected_err=(tf.OpError, "Name: in1, Feature list: a, Index: 2." + " Data types don't match. Expected type: int64" + " Feature is: float_list")) def testSequenceExampleListWithWrongShapeFails(self): original = sequence_example(feature_lists=feature_lists({ "a": feature_list([ - int64_feature([2, 3]), - int64_feature([2, 3, 4])]), - })) + int64_feature([2, 3]), int64_feature([2, 3, 4]) + ]), + })) serialized = original.SerializeToString() - self._test({ - "example_name": "in1", - "serialized": tf.convert_to_tensor(serialized), - "sequence_features": {"a": tf.FixedLenSequenceFeature((2,), tf.int64)} - }, expected_err=( - tf.OpError, - r"Name: in1, Key: a, Index: 1." - r" Number of int64 values != expected." - r" values size: 3 but output shape: \[2\]")) + self._test( + { + "example_name": "in1", + "serialized": tf.convert_to_tensor(serialized), + "sequence_features": {"a": tf.FixedLenSequenceFeature( + (2,), tf.int64)} + }, + expected_err=(tf.OpError, r"Name: in1, Key: a, Index: 1." + r" Number of int64 values != expected." + r" values size: 3 but output shape: \[2\]")) def testSequenceExampleWithMissingFeatureListFails(self): original = sequence_example(feature_lists=feature_lists({})) # Test fails because we didn't add: # feature_list_dense_defaults = {"a": None} - self._test({ - "example_name": "in1", - "serialized": tf.convert_to_tensor(original.SerializeToString()), - "sequence_features": {"a": tf.FixedLenSequenceFeature((2,), tf.int64)} - }, expected_err=( - tf.OpError, - "Name: in1, Feature list 'a' is required but could not be found." 
- " Did you mean to include it in" - " feature_list_dense_missing_assumed_empty or" - " feature_list_dense_defaults?")) + self._test( + { + "example_name": "in1", + "serialized": tf.convert_to_tensor(original.SerializeToString()), + "sequence_features": {"a": tf.FixedLenSequenceFeature( + (2,), tf.int64)} + }, + expected_err=( + tf.OpError, + "Name: in1, Feature list 'a' is required but could not be found." + " Did you mean to include it in" + " feature_list_dense_missing_assumed_empty or" + " feature_list_dense_defaults?")) class DecodeJSONExampleTest(tf.test.TestCase): @@ -740,14 +852,15 @@ class DecodeJSONExampleTest(tf.test.TestCase): json_tensor = tf.constant( [json_format.MessageToJson(m) for m in examples.flatten()], - shape=examples.shape, dtype=tf.string) + shape=examples.shape, + dtype=tf.string) binary_tensor = tf.decode_json_example(json_tensor) binary_val = sess.run(binary_tensor) if examples.shape: self.assertShapeEqual(binary_val, json_tensor) - for input_example, output_binary in zip(np.array(examples).flatten(), - binary_val.flatten()): + for input_example, output_binary in zip( + np.array(examples).flatten(), binary_val.flatten()): output_example = tf.train.Example() output_example.ParseFromString(output_binary) self.assertProtoEquals(input_example, output_example) diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py index 281c09adb9e..1edfdf47a87 100644 --- a/tensorflow/python/kernel_tests/rnn_test.py +++ b/tensorflow/python/kernel_tests/rnn_test.py @@ -236,6 +236,40 @@ class RNNTest(tf.test.TestCase): 1.0 * (1 + 1) * np.ones((input_size)), 1.0 * (2 + 1) * np.ones((input_size))))) + def _testScope(self, factory, prefix="prefix", use_outer_scope=True): + with self.test_session(use_gpu=True, graph=tf.Graph()): + if use_outer_scope: + with tf.variable_scope(prefix) as scope: + factory(scope) + else: + factory(prefix) + + # check that all the variables names starts + # with the proper scope. 
+ tf.initialize_all_variables() + all_vars = tf.all_variables() + prefix = prefix or "RNN" + scope_vars = [v for v in all_vars if v.name.startswith(prefix)] + tf.logging.info("RNN with scope: %s (%s)" + % (prefix, "scope" if use_outer_scope else "str")) + for v in scope_vars: + tf.logging.info(v.name) + self.assertEqual(len(scope_vars), len(all_vars)) + + def testScope(self): + def factory(scope): + cell = Plus1RNNCell() + batch_size = 2 + input_size = 5 + max_length = 8 # unrolled up to this length + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(batch_size, input_size))] + return tf.nn.rnn(cell, inputs, dtype=tf.float32, scope=scope) + + self._testScope(factory, use_outer_scope=True) + self._testScope(factory, use_outer_scope=False) + self._testScope(factory, prefix=None, use_outer_scope=False) + class GRUTest(tf.test.TestCase): @@ -275,6 +309,46 @@ class GRUTest(tf.test.TestCase): self._testDynamic(use_gpu=False) self._testDynamic(use_gpu=True) + def _testScope(self, factory, prefix="prefix", use_outer_scope=True): + with self.test_session(use_gpu=True, graph=tf.Graph()): + if use_outer_scope: + with tf.variable_scope(prefix) as scope: + factory(scope) + else: + factory(prefix) + tf.initialize_all_variables() + + # check that all the variables names starts + # with the proper scope. 
+ all_vars = tf.all_variables() + prefix = prefix or "RNN" + scope_vars = [v for v in all_vars if v.name.startswith(prefix)] + tf.logging.info("RNN with scope: %s (%s)" + % (prefix, "scope" if use_outer_scope else "str")) + for v in scope_vars: + tf.logging.info(v.name) + self.assertEqual(len(scope_vars), len(all_vars)) + + def testDynamicScope(self): + time_steps = 8 + num_units = 3 + input_size = 5 + batch_size = 2 + sequence_length = np.random.randint(0, time_steps, size=batch_size) + + def factory(scope): + concat_inputs = tf.placeholder( + tf.float32, shape=(time_steps, batch_size, input_size)) + cell = tf.nn.rnn_cell.GRUCell(num_units=num_units) + return tf.nn.dynamic_rnn(cell, inputs=concat_inputs, + sequence_length=sequence_length, + time_major=True, dtype=tf.float32, + scope=scope) + + self._testScope(factory, use_outer_scope=True) + self._testScope(factory, use_outer_scope=False) + self._testScope(factory, prefix=None, use_outer_scope=False) + class LSTMTest(tf.test.TestCase): @@ -1053,7 +1127,11 @@ class BidirectionalRNNTest(tf.test.TestCase): self._seed = 23489 np.random.seed(self._seed) - def _createBidirectionalRNN(self, use_gpu, use_shape, use_sequence_length): + def _createBidirectionalRNN(self, + use_gpu, + use_shape, + use_sequence_length, + scope=None): num_units = 3 input_size = 5 batch_size = 2 @@ -1077,7 +1155,8 @@ class BidirectionalRNNTest(tf.test.TestCase): cell_bw, inputs, dtype=tf.float32, - sequence_length=sequence_length) + sequence_length=sequence_length, + scope=scope) self.assertEqual(len(outputs), len(inputs)) for out in outputs: self.assertEqual( @@ -1179,7 +1258,8 @@ class BidirectionalRNNTest(tf.test.TestCase): use_shape=True) def _createBidirectionalDynamicRNN(self, use_gpu, use_shape, - use_state_tuple, use_time_major): + use_state_tuple, use_time_major, + scope=None): num_units = 3 input_size = 5 batch_size = 2 @@ -1205,7 +1285,8 @@ class BidirectionalRNNTest(tf.test.TestCase): inputs_c, sequence_length, dtype=tf.float32, - 
time_major=use_time_major) + time_major=use_time_major, + scope=scope) outputs = tf.concat(2, outputs) state_fw, state_bw = states outputs_shape = [None, max_length, 2 * num_units] @@ -1286,6 +1367,54 @@ class BidirectionalRNNTest(tf.test.TestCase): use_state_tuple=option[2], use_time_major=option[3]) + def _testScope(self, factory, prefix="prefix", use_outer_scope=True): + # REMARKS: factory(scope) is a function accepting a scope + # as an argument, such scope can be None, a string + # or a VariableScope instance. + with self.test_session(use_gpu=True, graph=tf.Graph()): + if use_outer_scope: + with tf.variable_scope(prefix) as scope: + factory(scope) + else: + factory(prefix) + + # check that all the variables names starts + # with the proper scope. + tf.initialize_all_variables() + all_vars = tf.all_variables() + prefix = prefix or "BiRNN" + scope_vars = [v for v in all_vars if v.name.startswith(prefix)] + tf.logging.info("BiRNN with scope: %s (%s)" + % (prefix, "scope" if use_outer_scope else "str")) + for v in scope_vars: + tf.logging.info(v.name) + self.assertEqual(len(scope_vars), len(all_vars)) + + def testBidirectionalRNNScope(self): + def factory(scope): + return self._createBidirectionalRNN( + use_gpu=True, use_shape=True, + use_sequence_length=True, scope=scope) + + self._testScope(factory, use_outer_scope=True) + self._testScope(factory, use_outer_scope=False) + self._testScope(factory, prefix=None, use_outer_scope=False) + + def testBidirectionalDynamicRNNScope(self): + def get_factory(use_time_major): + def factory(scope): + return self._createBidirectionalDynamicRNN( + use_gpu=True, use_shape=True, use_state_tuple=True, + use_time_major=use_time_major, scope=scope) + return factory + + self._testScope(get_factory(True), use_outer_scope=True) + self._testScope(get_factory(True), use_outer_scope=False) + self._testScope(get_factory(True), prefix=None, use_outer_scope=False) + self._testScope(get_factory(False), use_outer_scope=True) + 
self._testScope(get_factory(False), use_outer_scope=False) + self._testScope(get_factory(False), prefix=None, use_outer_scope=False) + class MultiDimensionalLSTMTest(tf.test.TestCase): @@ -1672,6 +1801,104 @@ class RawRNNTest(tf.test.TestCase): self.assertAllEqual( np.ones((max_time, batch_size, 1), np.int64), output_vals[1]) + def _testScope(self, factory, prefix="prefix", use_outer_scope=True): + with self.test_session(use_gpu=True, graph=tf.Graph()): + if use_outer_scope: + with tf.variable_scope(prefix) as scope: + factory(scope) + else: + factory(prefix) + tf.initialize_all_variables() + + # check that all the variables names starts + # with the proper scope. + all_vars = tf.all_variables() + prefix = prefix or "RNN" + scope_vars = [v for v in all_vars if v.name.startswith(prefix)] + tf.logging.info("RNN with scope: %s (%s)" + % (prefix, "scope" if use_outer_scope else "str")) + for v in scope_vars: + tf.logging.info(v.name) + self.assertEqual(len(scope_vars), len(all_vars)) + + def testRawRNNScope(self): + max_time = 10 + batch_size = 16 + input_depth = 4 + num_units = 3 + + def factory(scope): + inputs = tf.placeholder(shape=(max_time, batch_size, input_depth), + dtype=tf.float32) + sequence_length = tf.placeholder(shape=(batch_size,), dtype=tf.int32) + inputs_ta = tf.TensorArray(dtype=tf.float32, size=tf.shape(inputs)[0]) + inputs_ta = inputs_ta.unpack(inputs) + + def loop_fn(time_, cell_output, unused_loop_state): + emit_output = cell_output # == None for time == 0 + elements_finished = (time_ >= sequence_length) + finished = tf.reduce_all(elements_finished) + # For the very final iteration, we must emit a dummy input + next_input = tf.cond( + finished, + lambda: tf.zeros([batch_size, input_depth], dtype=tf.float32), + lambda: inputs_ta.read(time_)) + return (elements_finished, next_input, emit_output, None) + + cell = tf.nn.rnn_cell.LSTMCell(num_units, state_is_tuple=True) + initial_state = cell.zero_state(batch_size, tf.float32) + return 
tf.nn.raw_rnn(cell, loop_fn, initial_state, scope=scope) + + self._testScope(factory, use_outer_scope=True) + self._testScope(factory, use_outer_scope=False) + self._testScope(factory, prefix=None, use_outer_scope=False) + + +class StateSaverRNNTest(tf.test.TestCase): + + def setUp(self): + self._seed = 23489 + np.random.seed(self._seed) + + def _testScope(self, factory, prefix="prefix", use_outer_scope=True): + with self.test_session(use_gpu=True, graph=tf.Graph()): + if use_outer_scope: + with tf.variable_scope(prefix) as scope: + factory(scope) + else: + factory(prefix) + tf.initialize_all_variables() + + # check that all the variables names starts + # with the proper scope. + all_vars = tf.all_variables() + prefix = prefix or "RNN" + scope_vars = [v for v in all_vars if v.name.startswith(prefix)] + tf.logging.info("RNN with scope: %s (%s)" + % (prefix, "scope" if use_outer_scope else "str")) + for v in scope_vars: + tf.logging.info(v.name) + self.assertEqual(len(scope_vars), len(all_vars)) + + def testStateSaverRNNScope(self): + num_units = 3 + input_size = 5 + batch_size = 2 + max_length = 8 + def factory(scope): + initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed) + state_saver = TestStateSaver(batch_size, 2 * num_units) + cell = tf.nn.rnn_cell.LSTMCell( + num_units, use_peepholes=False, initializer=initializer) + inputs = max_length * [ + tf.placeholder(tf.float32, shape=(batch_size, input_size))] + return tf.nn.state_saving_rnn( + cell, inputs, state_saver=state_saver, + state_name="save_lstm", scope=scope) + + self._testScope(factory, use_outer_scope=True) + self._testScope(factory, use_outer_scope=False) + self._testScope(factory, prefix=None, use_outer_scope=False) ######### Benchmarking RNN code diff --git a/tensorflow/python/kernel_tests/scan_ops_test.py b/tensorflow/python/kernel_tests/scan_ops_test.py index 1197b49a5fd..4db5cf51c4e 100644 --- a/tensorflow/python/kernel_tests/scan_ops_test.py +++ 
b/tensorflow/python/kernel_tests/scan_ops_test.py @@ -59,9 +59,9 @@ class CumsumTest(tf.test.TestCase): valid_dtypes = [np.int32, np.int64, np.float16, np.float32, np.float64, np.complex64, np.complex128] - def _compare(self, x, axis, exclusive, reverse, use_gpu=False): + def _compare(self, x, axis, exclusive, reverse): np_out = handle_options(np.cumsum, x, axis, exclusive, reverse) - with self.test_session(use_gpu=use_gpu): + with self.test_session(): tf_out = tf.cumsum(x, axis, exclusive, reverse).eval() self.assertAllClose(np_out, tf_out) @@ -69,8 +69,7 @@ class CumsumTest(tf.test.TestCase): def _compareAll(self, x, axis): for exclusive in [True, False]: for reverse in [True, False]: - for use_gpu in [True, False]: - self._compare(x, axis, exclusive, reverse, use_gpu) + self._compare(x, axis, exclusive, reverse) def test1D(self): for dtype in self.valid_dtypes: @@ -144,9 +143,9 @@ class CumprodTest(tf.test.TestCase): valid_dtypes = [np.int32, np.int64, np.float16, np.float32, np.float64, np.complex64, np.complex128] - def _compare(self, x, axis, exclusive, reverse, use_gpu=False): + def _compare(self, x, axis, exclusive, reverse): np_out = handle_options(np.cumprod, x, axis, exclusive, reverse) - with self.test_session(use_gpu=use_gpu): + with self.test_session(): tf_out = tf.cumprod(x, axis, exclusive, reverse).eval() self.assertAllClose(np_out, tf_out) @@ -154,8 +153,7 @@ class CumprodTest(tf.test.TestCase): def _compareAll(self, x, axis): for exclusive in [True, False]: for reverse in [True, False]: - for use_gpu in [True, False]: - self._compare(x, axis, exclusive, reverse, use_gpu) + self._compare(x, axis, exclusive, reverse) def test1D(self): diff --git a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py index 714b86fbfc7..3d08c2afbbe 100644 --- a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py +++ b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py @@ -1,4 +1,4 @@ -# 
Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,106 +12,118 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - -"""Tests for tensorflow.ops.tf.self_adjoint_eig.""" +"""Tests for tensorflow.ops.math_ops.matrix_inverse.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function - import numpy as np -from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf -class SelfAdjointEigOpTest(tf.test.TestCase): - - def _testEigs(self, x, d, tf_ans, use_gpu=False): - np_eig_val, np_eig_vec = np.linalg.eig(x) - - # First check the eigenvalues - self.assertAllClose(sorted(np_eig_val), sorted(tf_ans[0, :])) - - # need to make things canonical. This test may still fail in case there are - # two equal eigenvalues, so that there is indeterminacy in the eigenvectors. - # For now, assume that we will only test matrices with distinct eigenvalues. 
- np_arg = np.argsort(np_eig_val) - tf_arg = np.argsort(tf_ans[0, :]) - - np_eig_vecs_sorted = np.array([np_eig_vec[:, i] for i in np_arg]).T - tf_eig_vecs_sorted = np.array([tf_ans[1:, i] for i in tf_arg]).T - np_eig_vecs_signed_sorted = np.array([np_eig_vecs_sorted[:, i] * - np.sign(np_eig_vecs_sorted[0, i]) - for i in xrange(d)]).T - tf_eig_vecs_signed_sorted = np.array([tf_eig_vecs_sorted[:, i] * - np.sign(tf_eig_vecs_sorted[0, i]) - for i in xrange(d)]).T - self.assertAllClose(np_eig_vecs_signed_sorted, tf_eig_vecs_signed_sorted) - - def _compareSelfAdjointEig(self, x, use_gpu=False): - with self.test_session() as sess: - tf_eig = tf.self_adjoint_eig(tf.constant(x)) - tf_eig_out = sess.run([tf_eig])[0] - - d, _ = x.shape - self.assertEqual([d+1, d], tf_eig.get_shape().dims) - self._testEigs(x, d, tf_eig_out, use_gpu) - - def _compareBatchSelfAdjointEigRank3(self, x, use_gpu=False): - with self.test_session() as sess: - tf_eig = tf.batch_self_adjoint_eig(tf.constant(x)) - tf_out = sess.run([tf_eig])[0] - dlist = x.shape - d = dlist[-2] - - self.assertEqual([d+1, d], tf_eig.get_shape().dims[-2:]) - # not testing the values. 
- self.assertEqual(dlist[0], tf_eig.get_shape().dims[0]) - - for i in xrange(dlist[0]): - self._testEigs(x[i], d, tf_out[i]) - - def _compareBatchSelfAdjointEigRank2(self, x, use_gpu=False): - with self.test_session() as sess: - tf_eig = tf.batch_self_adjoint_eig(tf.constant(x)) - tf_out = sess.run([tf_eig])[0] - dlist = x.shape - d = dlist[-2] - - self.assertEqual(len(tf_eig.get_shape()), 2) - self.assertEqual([d+1, d], tf_eig.get_shape().dims[-2:]) - self._testEigs(x, d, tf_out) - - def testBasic(self): - self._compareSelfAdjointEig( - np.array([[3., 0., 1.], [0., 2., -2.], [1., -2., 3.]])) - - def testBatch(self): - simple_array = np.array([[[1., 0.], [0., 5.]]]) # shape (1, 2, 2) - simple_array_2d = simple_array[0] # shape (2, 2) - self._compareBatchSelfAdjointEigRank3(simple_array) - self._compareBatchSelfAdjointEigRank3( - np.vstack((simple_array, simple_array))) - self._compareBatchSelfAdjointEigRank2(simple_array_2d) - odd_sized_array = np.array([[[3., 0., 1.], [0., 2., -2.], [1., -2., 3.]]]) - self._compareBatchSelfAdjointEigRank3( - np.vstack((odd_sized_array, odd_sized_array))) - - # Generate random positive-definite matrices. - matrices = np.random.rand(10, 5, 5) - for i in xrange(10): - matrices[i] = np.dot(matrices[i].T, matrices[i]) - self._compareBatchSelfAdjointEigRank3(matrices) - - def testNonSquareMatrix(self): - with self.assertRaises(ValueError): - tf.self_adjoint_eig(tf.constant(np.array([[1., 2., 3.], [3., 4., 5.]]))) +class SelfAdjointEigTest(tf.test.TestCase): def testWrongDimensions(self): - tensor3 = tf.constant([1., 2.]) + # The input to self_adjoint_eig should be 2-dimensional tensor. + scalar = tf.constant(1.) 
with self.assertRaises(ValueError): - tf.self_adjoint_eig(tensor3) + tf.self_adjoint_eig(scalar) + vector = tf.constant([1., 2.]) + with self.assertRaises(ValueError): + tf.self_adjoint_eig(vector) + tensor = tf.constant([[[1., 2.], [3., 4.]], [[1., 2.], [3., 4.]]]) + with self.assertRaises(ValueError): + tf.self_adjoint_eig(tensor) + + # The input to batch_batch_self_adjoint_eig should be a tensor of + # at least rank 2. + scalar = tf.constant(1.) + with self.assertRaises(ValueError): + tf.batch_self_adjoint_eig(scalar) + vector = tf.constant([1., 2.]) + with self.assertRaises(ValueError): + tf.batch_self_adjoint_eig(vector) -if __name__ == "__main__": +def SortEigenDecomposition(e, v): + if v.ndim < 2: + return e, v + else: + perm = np.argsort(e, -1) + return np.take(e, perm, -1), np.take(v, perm, -1) + + +def _GetSelfAdjointEigTest(dtype_, shape_): + + def CompareEigenVectors(self, x, y, atol): + # Eigenvectors are only unique up to sign so we normalize the signs first. + signs = np.sign(np.sum(np.divide(x, y), -2, keepdims=True)) + x *= signs + self.assertAllClose(x, y, atol) + + def CompareEigenDecompositions(self, x_e, x_v, y_e, y_v, atol): + num_batches = int(np.prod(x_e.shape[:-1])) + n = x_e.shape[-1] + x_e = np.reshape(x_e, [num_batches] + [n]) + x_v = np.reshape(x_v, [num_batches] + [n, n]) + y_e = np.reshape(y_e, [num_batches] + [n]) + y_v = np.reshape(y_v, [num_batches] + [n, n]) + for i in range(num_batches): + x_ei, x_vi = SortEigenDecomposition(x_e[i, :], x_v[i, :, :]) + y_ei, y_vi = SortEigenDecomposition(y_e[i, :], y_v[i, :, :]) + self.assertAllClose(x_ei, y_ei, atol=atol) + CompareEigenVectors(self, x_vi, y_vi, atol) + + def Test(self): + np.random.seed(1) + n = shape_[-1] + batch_shape = shape_[:-2] + a = np.random.uniform( + low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(dtype_) + a += a.T + a = np.tile(a, batch_shape + (1, 1)) + if dtype_ == np.float32: + atol = 1e-4 + else: + atol = 1e-14 + for compute_v in False, True: + np_e, np_v = 
np.linalg.eig(a) + with self.test_session(): + if compute_v: + if a.ndim == 2: + op = tf.self_adjoint_eig + else: + op = tf.batch_self_adjoint_eig + tf_e, tf_v = op(tf.constant(a)) + + # Check that V*diag(E)*V^T is close to A. + a_ev = tf.batch_matmul( + tf.batch_matmul(tf_v, tf.batch_matrix_diag(tf_e)), + tf_v, + adj_y=True) + self.assertAllClose(a_ev.eval(), a, atol=atol) + + # Compare to numpy.linalg.eig. + CompareEigenDecompositions(self, np_e, np_v, tf_e.eval(), tf_v.eval(), + atol) + else: + if a.ndim == 2: + op = tf.self_adjoint_eigvals + else: + op = tf.batch_self_adjoint_eigvals + tf_e = op(tf.constant(a)) + self.assertAllClose( + np.sort(np_e, -1), np.sort(tf_e.eval(), -1), atol=atol) + + return Test + + +if __name__ == '__main__': + for dtype in np.float32, np.float64: + for size in 1, 2, 5, 10: + for batch_dims in [(), (3,)] + [(3, 2)] * (max(size, size) < 10): + shape = batch_dims + (size, size) + name = '%s_%s' % (dtype.__name__, '_'.join(map(str, shape))) + setattr(SelfAdjointEigTest, 'testSelfAdjointEig_' + name, + _GetSelfAdjointEigTest(dtype, shape)) tf.test.main() diff --git a/tensorflow/python/kernel_tests/seq2seq_test.py b/tensorflow/python/kernel_tests/seq2seq_test.py index 58af5c42bd8..c9a8203b5d9 100644 --- a/tensorflow/python/kernel_tests/seq2seq_test.py +++ b/tensorflow/python/kernel_tests/seq2seq_test.py @@ -263,6 +263,32 @@ class Seq2SeqTest(tf.test.TestCase): res = sess.run([mem]) self.assertEqual((2, 2), res[0].shape) + def testAttentionDecoderStateIsTuple(self): + with self.test_session() as sess: + with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): + cell = tf.nn.rnn_cell.BasicLSTMCell(2, state_is_tuple=True) + cell = tf.nn.rnn_cell.MultiRNNCell(cells=[cell] * 2, + state_is_tuple=True) + inp = [tf.constant(0.5, shape=[2, 2])] * 2 + enc_outputs, enc_state = tf.nn.rnn(cell, inp, dtype=tf.float32) + attn_states = tf.concat(1, [tf.reshape(e, [-1, 1, cell.output_size]) + for e in enc_outputs]) + dec_inp = 
[tf.constant(0.4, shape=[2, 2])] * 3 + dec, mem = tf.nn.seq2seq.attention_decoder( + dec_inp, enc_state, + attn_states, cell, output_size=4) + sess.run([tf.initialize_all_variables()]) + res = sess.run(dec) + self.assertEqual(3, len(res)) + self.assertEqual((2, 4), res[0].shape) + + res = sess.run([mem]) + self.assertEqual(2, len(res[0])) + self.assertEqual((2, 2), res[0][0].c.shape) + self.assertEqual((2, 2), res[0][0].h.shape) + self.assertEqual((2, 2), res[0][1].c.shape) + self.assertEqual((2, 2), res[0][1].h.shape) + def testEmbeddingAttentionDecoder(self): with self.test_session() as sess: with tf.variable_scope("root", initializer=tf.constant_initializer(0.5)): diff --git a/tensorflow/python/kernel_tests/spacetobatch_op_test.py b/tensorflow/python/kernel_tests/spacetobatch_op_test.py index f90abb95e8a..f3ff2d517af 100644 --- a/tensorflow/python/kernel_tests/spacetobatch_op_test.py +++ b/tensorflow/python/kernel_tests/spacetobatch_op_test.py @@ -27,16 +27,15 @@ class SpaceToBatchTest(tf.test.TestCase): """Tests input-output pairs for the SpaceToBatch and BatchToSpace ops.""" def _testPad(self, inputs, paddings, block_size, outputs): - for use_gpu in [False, True]: - with self.test_session(use_gpu=use_gpu): - # outputs = space_to_batch(inputs) - x_tf = tf.space_to_batch( - tf.to_float(inputs), paddings, block_size=block_size) - self.assertAllEqual(x_tf.eval(), outputs) - # inputs = batch_to_space(outputs) - x_tf = tf.batch_to_space( - tf.to_float(outputs), paddings, block_size=block_size) - self.assertAllEqual(x_tf.eval(), inputs) + with self.test_session(): + # outputs = space_to_batch(inputs) + x_tf = tf.space_to_batch( + tf.to_float(inputs), paddings, block_size=block_size) + self.assertAllEqual(x_tf.eval(), outputs) + # inputs = batch_to_space(outputs) + x_tf = tf.batch_to_space( + tf.to_float(outputs), paddings, block_size=block_size) + self.assertAllEqual(x_tf.eval(), inputs) def _testOne(self, inputs, block_size, outputs): paddings = np.zeros((2, 2), 
dtype=np.int32) diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py index d0f31d14137..29b57e80944 100644 --- a/tensorflow/python/kernel_tests/sparse_ops_test.py +++ b/tensorflow/python/kernel_tests/sparse_ops_test.py @@ -119,13 +119,16 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase): class SparseMergeTest(test_util.TensorFlowTestCase): def _SparseTensor_3x50(self, indices_dtype, values_dtype): + # NOTE: This input is intentionally not sorted to validate the + # already_sorted flag below. ind = np.array([ [0, 0], - [1, 0], [1, 1], [1, 2], - [2, 0], [2, 1]]) + [1, 0], [1, 2], + [2, 0], [2, 1], + [1, 1]]) # NB: these are not sorted - indices = np.array([0, 13, 10, 14, 32, 33]) - values = np.array([-3, 4, 1, 1, 5, 9]) + indices = np.array([0, 13, 10, 33, 32, 14]) + values = np.array([-3, 4, 1, 9, 5, 1]) shape = np.array([3, 3]) indices = ops.SparseTensor( constant_op.constant(ind, dtypes.int64), @@ -137,6 +140,28 @@ class SparseMergeTest(test_util.TensorFlowTestCase): constant_op.constant(shape, dtypes.int64)) return indices, values + def _AssertResultsSorted(self, output, vocab_size): + self.assertAllEqual( + output.indices, + [[0, 0], [1, 10], [1, 13], [1, 14], [2, 32], [2, 33]]) + self.assertAllEqual( + output.values, + [-3, 1, 4, 1, 5, 9]) + self.assertAllEqual( + output.shape, + [3, vocab_size]) + + def _AssertResultsNotSorted(self, output, vocab_size): + self.assertAllEqual( + output.indices, + [[0, 0], [1, 13], [1, 10], [2, 33], [2, 32], [1, 14]]) + self.assertAllEqual( + output.values, + [-3, 4, 1, 9, 5, 1]) + self.assertAllEqual( + output.shape, + [3, vocab_size]) + def testInt32AndFloat32(self): vocab_size = 50 with self.test_session(use_gpu=False) as sess: @@ -144,15 +169,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase): sp_output = sparse_ops.sparse_merge(indices, values, vocab_size) output = sess.run(sp_output) - self.assertAllEqual( - output.indices, - [[0, 0], [1, 10], 
[1, 13], [1, 14], [2, 32], [2, 33]]) - self.assertAllEqual( - output.values, - [-3, 1, 4, 1, 5, 9]) - self.assertAllEqual( - output.shape, - [3, vocab_size]) + self._AssertResultsSorted(output, vocab_size) def testInt64AndFloat32(self): vocab_size = 50 @@ -161,15 +178,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase): sp_output = sparse_ops.sparse_merge(indices, values, vocab_size) output = sess.run(sp_output) - self.assertAllEqual( - output.indices, - [[0, 0], [1, 10], [1, 13], [1, 14], [2, 32], [2, 33]]) - self.assertAllEqual( - output.values, - [-3, 1, 4, 1, 5, 9]) - self.assertAllEqual( - output.shape, - [3, vocab_size]) + self._AssertResultsSorted(output, vocab_size) def testInt64AndFloat64(self): vocab_size = 50 @@ -178,15 +187,37 @@ class SparseMergeTest(test_util.TensorFlowTestCase): sp_output = sparse_ops.sparse_merge(indices, values, vocab_size) output = sess.run(sp_output) - self.assertAllEqual( - output.indices, - [[0, 0], [1, 10], [1, 13], [1, 14], [2, 32], [2, 33]]) - self.assertAllEqual( - output.values, - [-3, 1, 4, 1, 5, 9]) - self.assertAllEqual( - output.shape, - [3, vocab_size]) + self._AssertResultsSorted(output, vocab_size) + + def testInt32AndFloat32NonCanonicalOrder(self): + vocab_size = 50 + with self.test_session(use_gpu=False) as sess: + indices, values = self._SparseTensor_3x50(dtypes.int32, dtypes.float32) + sp_output = sparse_ops.sparse_merge( + indices, values, vocab_size, already_sorted=True) + + output = sess.run(sp_output) + self._AssertResultsNotSorted(output, vocab_size) + + def testInt64AndFloat32NonCanonicalOrder(self): + vocab_size = 50 + with self.test_session(use_gpu=False) as sess: + indices, values = self._SparseTensor_3x50(dtypes.int64, dtypes.float32) + sp_output = sparse_ops.sparse_merge( + indices, values, vocab_size, already_sorted=True) + + output = sess.run(sp_output) + self._AssertResultsNotSorted(output, vocab_size) + + def testInt64AndFloat64NonCanonicalOrder(self): + vocab_size = 50 + with 
self.test_session(use_gpu=False) as sess: + indices, values = self._SparseTensor_3x50(dtypes.int64, dtypes.float64) + sp_output = sparse_ops.sparse_merge( + indices, values, vocab_size, already_sorted=True) + + output = sess.run(sp_output) + self._AssertResultsNotSorted(output, vocab_size) class SparseRetainTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py new file mode 100644 index 00000000000..6c2d8369799 --- /dev/null +++ b/tensorflow/python/kernel_tests/svd_op_test.py @@ -0,0 +1,112 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tensorflow.ops.math_ops.matrix_inverse.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + + +class SvdOpTest(tf.test.TestCase): + + def testWrongDimensions(self): + # The input to svd should be 2-dimensional tensor. + scalar = tf.constant(1.) + with self.assertRaises(ValueError): + tf.svd(scalar) + vector = tf.constant([1., 2.]) + with self.assertRaises(ValueError): + tf.svd(vector) + tensor = tf.constant([[[1., 2.], [3., 4.]], [[1., 2.], [3., 4.]]]) + with self.assertRaises(ValueError): + tf.svd(tensor) + + # The input to batch_svd should be a tensor of at least rank 2. 
+ scalar = tf.constant(1.) + with self.assertRaises(ValueError): + tf.batch_svd(scalar) + vector = tf.constant([1., 2.]) + with self.assertRaises(ValueError): + tf.batch_svd(vector) + + +def _GetSvdOpTest(dtype_, shape_): + + def _CompareSingularVectors(self, x, y, atol): + # Singular vectors are only unique up to sign (complex phase factor for + # complex matrices), so we normalize the signs first. + signs = np.sign(np.sum(np.divide(x, y), -2, keepdims=True)) + x *= signs + self.assertAllClose(x, y, atol=atol) + + def Test(self): + np.random.seed(1) + x = np.random.uniform( + low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_) + if dtype_ == np.float32: + atol = 1e-4 + else: + atol = 1e-14 + for compute_uv in False, True: + for full_matrices in False, True: + with self.test_session(): + if x.ndim == 2: + if compute_uv: + tf_s, tf_u, tf_v = tf.svd(tf.constant(x), + compute_uv=compute_uv, + full_matrices=full_matrices) + else: + tf_s = tf.svd(tf.constant(x), + compute_uv=compute_uv, + full_matrices=full_matrices) + else: + if compute_uv: + tf_s, tf_u, tf_v = tf.batch_svd( + tf.constant(x), + compute_uv=compute_uv, + full_matrices=full_matrices) + else: + tf_s = tf.batch_svd( + tf.constant(x), + compute_uv=compute_uv, + full_matrices=full_matrices) + if compute_uv: + np_u, np_s, np_v = np.linalg.svd(x, + compute_uv=compute_uv, + full_matrices=full_matrices) + else: + np_s = np.linalg.svd(x, + compute_uv=compute_uv, + full_matrices=full_matrices) + self.assertAllClose(np_s, tf_s.eval(), atol=atol) + if compute_uv: + _CompareSingularVectors(self, np_u, tf_u.eval(), atol) + _CompareSingularVectors(self, np.swapaxes(np_v, -2, -1), + tf_v.eval(), atol) + + return Test + + +if __name__ == '__main__': + for dtype in np.float32, np.float64: + for m in 1, 2, 5, 10: + for n in 1, 2, 5, 10: + for batch_dims in [(), (3,)] + [(3, 2)] * (max(m, n) < 10): + shape = batch_dims + (m, n) + name = '%s_%s' % (dtype.__name__, '_'.join(map(str, shape))) + 
setattr(SvdOpTest, 'testSvd_' + name, _GetSvdOpTest(dtype, shape)) + tf.test.main() diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py index b5f4288871e..f6282439ae4 100644 --- a/tensorflow/python/kernel_tests/variable_scope_test.py +++ b/tensorflow/python/kernel_tests/variable_scope_test.py @@ -69,6 +69,16 @@ class VariableScopeTest(tf.test.TestCase): sess.run(tf.initialize_variables([w])) self.assertAllClose(w.eval(), 0.3) + def testVarScopeDType(self): + with self.test_session(): + with tf.variable_scope("tower") as tower: + with tf.variable_scope("foo", dtype=tf.float16): + v = tf.get_variable("v", []) + self.assertEqual(v.dtype, tf.float16_ref) + with tf.variable_scope(tower, dtype=tf.float16): + w = tf.get_variable("w", []) + self.assertEqual(w.dtype, tf.float16_ref) + def testInitFromNonTensorValue(self): with self.test_session() as sess: v = tf.get_variable("v", initializer=4, dtype=tf.int32) diff --git a/tensorflow/python/kernel_tests/zero_division_test.py b/tensorflow/python/kernel_tests/zero_division_test.py index 9a0d28bf2e3..e635aff84d5 100644 --- a/tensorflow/python/kernel_tests/zero_division_test.py +++ b/tensorflow/python/kernel_tests/zero_division_test.py @@ -25,31 +25,30 @@ import tensorflow as tf class ZeroDivisionTest(tf.test.TestCase): def testZeros(self): - for use_gpu in False, True: - with self.test_session(use_gpu=use_gpu): - for dtype in tf.uint8, tf.int16, tf.int32, tf.int64: - zero = tf.constant(0, dtype=dtype) - one = tf.constant(1, dtype=dtype) - bads = [one // zero] - if dtype in (tf.int32, tf.int64): - bads.append(one % zero) - for bad in bads: - try: - result = bad.eval() - except tf.OpError as e: - # Ideally, we'd get a nice exception. In theory, this should only - # happen on CPU, but 32 bit integer GPU division is actually on - # CPU due to a placer bug. - # TODO(irving): Make stricter once the placer bug is fixed. 
- self.assertIn('Integer division by zero', str(e)) - else: - # On the GPU, integer division by zero produces all bits set. - # But apparently on some GPUs "all bits set" for 64 bit division - # means 32 bits set, so we allow 0xffffffff as well. This isn't - # very portable, so we may need to expand this list if other GPUs - # do different things. - self.assertTrue(use_gpu) - self.assertIn(result, (-1, 0xff, 0xffffffff)) + with self.test_session(): + for dtype in tf.uint8, tf.int16, tf.int32, tf.int64: + zero = tf.constant(0, dtype=dtype) + one = tf.constant(1, dtype=dtype) + bads = [one // zero] + if dtype in (tf.int32, tf.int64): + bads.append(one % zero) + for bad in bads: + try: + result = bad.eval() + except tf.OpError as e: + # Ideally, we'd get a nice exception. In theory, this should only + # happen on CPU, but 32 bit integer GPU division is actually on + # CPU due to a placer bug. + # TODO(irving): Make stricter once the placer bug is fixed. + self.assertIn('Integer division by zero', str(e)) + else: + # On the GPU, integer division by zero produces all bits set. + # But apparently on some GPUs "all bits set" for 64 bit division + # means 32 bits set, so we allow 0xffffffff as well. This isn't + # very portable, so we may need to expand this list if other GPUs + # do different things. + self.assertTrue(tf.test.is_gpu_available()) + self.assertIn(result, (-1, 0xff, 0xffffffff)) if __name__ == '__main__': diff --git a/tensorflow/python/lib/io/file_io.i b/tensorflow/python/lib/io/file_io.i index 42fbb906638..4e1c2aba69d 100644 --- a/tensorflow/python/lib/io/file_io.i +++ b/tensorflow/python/lib/io/file_io.i @@ -17,13 +17,15 @@ limitations under the License. 
%include "tensorflow/python/platform/base.i" %{ +#include "tensorflow/c/tf_status_helper.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/io/match.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/file_statistics.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" -#include "tensorflow/core/util/tf_status_helper.h" %} %{ @@ -82,7 +84,7 @@ void CreateDir(const string& dirname, TF_Status* out_status) { void CopyFile(const string& oldpath, const string& newpath, bool overwrite, TF_Status* out_status) { - // If overwrite is false and the newpath file exists then its an error. + // If overwrite is false and the newpath file exists then it's an error. if (!overwrite && FileExists(newpath)) { TF_SetStatus(out_status, TF_ALREADY_EXISTS, "file already exists"); return; @@ -142,6 +144,17 @@ bool IsDirectory(const string& dirname, TF_Status* out_status) { } return false; } + +using tensorflow::FileStatistics; + +void Stat(const string& filename, FileStatistics* stats, + TF_Status* out_status) { + tensorflow::Status status = tensorflow::Env::Default()->Stat(filename, + stats); + if (!status.ok()) { + Set_TF_Status_from_Status(out_status, status); + } +} %} // Wrap the above functions. 
@@ -159,3 +172,8 @@ void RenameFile(const string& oldname, const string& newname, bool overwrite, TF_Status* out_status); void DeleteRecursively(const string& dirname, TF_Status* out_status); bool IsDirectory(const string& dirname, TF_Status* out_status); +void Stat(const string& filename, tensorflow::FileStatistics* stats, + TF_Status* out_status); + +%include "tensorflow/core/lib/io/path.h" +%include "tensorflow/core/platform/file_statistics.h" diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py index a0ec199d3a0..9467e4b3456 100644 --- a/tensorflow/python/lib/io/file_io.py +++ b/tensorflow/python/lib/io/file_io.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """File IO methods that wrap the C++ FileSystem API. The C++ FileSystem API is SWIG wrapped in file_io.i. These functions call those @@ -22,6 +21,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import os + from tensorflow.python import pywrap_tensorflow from tensorflow.python.framework import errors from tensorflow.python.util import compat @@ -43,8 +44,8 @@ def read_file_to_string(filename): def write_string_to_file(filename, file_content): with errors.raise_exception_on_not_ok_status() as status: - pywrap_tensorflow.WriteStringToFile(compat.as_bytes(filename), - compat.as_bytes(file_content), status) + pywrap_tensorflow.WriteStringToFile( + compat.as_bytes(filename), compat.as_bytes(file_content), status) def get_matching_files(filename): @@ -61,22 +62,21 @@ def recursive_create_dir(dirname): with errors.raise_exception_on_not_ok_status() as status: dirs = dirname.split('/') for i in range(len(dirs)): - partial_dir = '/'.join(dirs[0:i+1]) + partial_dir = '/'.join(dirs[0:i + 1]) if partial_dir and not file_exists(partial_dir): 
pywrap_tensorflow.CreateDir(compat.as_bytes(partial_dir), status) def copy(oldpath, newpath, overwrite=False): with errors.raise_exception_on_not_ok_status() as status: - pywrap_tensorflow.CopyFile(compat.as_bytes(oldpath), - compat.as_bytes(newpath), overwrite, status) + pywrap_tensorflow.CopyFile( + compat.as_bytes(oldpath), compat.as_bytes(newpath), overwrite, status) def rename(oldname, newname, overwrite=False): with errors.raise_exception_on_not_ok_status() as status: - return pywrap_tensorflow.RenameFile(compat.as_bytes(oldname), - compat.as_bytes(newname), overwrite, - status) + return pywrap_tensorflow.RenameFile( + compat.as_bytes(oldname), compat.as_bytes(newname), overwrite, status) def delete_recursively(dirname): @@ -87,3 +87,74 @@ def delete_recursively(dirname): def is_directory(dirname): with errors.raise_exception_on_not_ok_status() as status: return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status) + + +def list_directory(dirname): + """Returns a list of entries contained within a directory. + + The list is in arbitrary order. It does not contain the special entries "." + and "..". + + Args: + dirname: string, path to a directory + + Raises: + NotFoundError if directory doesn't exist + + Returns: + [filename1, filename2, ... filenameN] + """ + if not is_directory(dirname): + raise errors.NotFoundError(None, None, 'Could not find directory') + file_list = get_matching_files(os.path.join(compat.as_str_any(dirname), '*')) + return [compat.as_bytes(pywrap_tensorflow.Basename(compat.as_bytes(filename))) + for filename in file_list] + + +def walk(top, in_order=True): + """Recursive directory tree generator for directories. + + Args: + top: string, a Directory name + in_order: bool, Traverse in order if True, post order if False. + + Errors that happen while listing directories are ignored. + + Yields: + # Each yield is a 3-tuple: the pathname of a directory, followed + # by lists of all its subdirectories and leaf files. 
+ (dirname, [subdirname, subdirname, ...], [filename, filename, ...]) + """ + top = compat.as_bytes(top) + try: + listing = list_directory(top) + except errors.NotFoundError: + return + + files = [] + subdirs = [] + for item in listing: + full_path = os.path.join(top, item) + if is_directory(full_path): + subdirs.append(item) + else: + files.append(item) + + here = (top, subdirs, files) + + if in_order: + yield here + + for subdir in subdirs: + for subitem in walk(os.path.join(top, subdir), in_order): + yield subitem + + if not in_order: + yield here + + +def stat(filename): + file_statistics = pywrap_tensorflow.FileStatistics() + with errors.raise_exception_on_not_ok_status() as status: + pywrap_tensorflow.Stat(compat.as_bytes(filename), file_statistics, status) + return file_statistics diff --git a/tensorflow/python/lib/io/file_io_test.py b/tensorflow/python/lib/io/file_io_test.py index b47b687a2ac..1b95d1b403a 100644 --- a/tensorflow/python/lib/io/file_io_test.py +++ b/tensorflow/python/lib/io/file_io_test.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================= - """Testing File IO operations in file_io.py.""" from __future__ import absolute_import from __future__ import division @@ -24,6 +23,7 @@ import tensorflow as tf from tensorflow.python.framework import errors from tensorflow.python.lib.io import file_io +from tensorflow.python.util import compat class FileIoTest(tf.test.TestCase): @@ -60,9 +60,9 @@ class FileIoTest(tf.test.TestCase): file_path = os.path.join(dir_path, name) file_io.write_string_to_file(file_path, "testing") expected_match = [os.path.join(dir_path, name) for name in files] - self.assertItemsEqual(file_io.get_matching_files(os.path.join(dir_path, - "file*.txt")), - expected_match) + self.assertItemsEqual( + file_io.get_matching_files(os.path.join(dir_path, "file*.txt")), + expected_match) file_io.delete_recursively(dir_path) self.assertFalse(file_io.file_exists(os.path.join(dir_path, "file3.txt"))) @@ -144,5 +144,117 @@ class FileIoTest(tf.test.TestCase): # False for a file. 
self.assertFalse(file_io.is_directory(file_path)) + def testListDirectory(self): + dir_path = os.path.join(self._base_dir, "test_dir") + file_io.create_dir(dir_path) + files = [b"file1.txt", b"file2.txt", b"file3.txt"] + for name in files: + file_path = os.path.join(dir_path, compat.as_str_any(name)) + file_io.write_string_to_file(file_path, "testing") + subdir_path = os.path.join(dir_path, "sub_dir") + file_io.create_dir(subdir_path) + subdir_file_path = os.path.join(subdir_path, "file4.txt") + file_io.write_string_to_file(subdir_file_path, "testing") + dir_list = file_io.list_directory(dir_path) + self.assertItemsEqual(files + [b"sub_dir"], dir_list) + + def testListDirectoryFailure(self): + dir_path = os.path.join(self._base_dir, "test_dir") + with self.assertRaises(errors.NotFoundError): + file_io.list_directory(dir_path) + + def _setupWalkDirectories(self, dir_path): + # Creating a file structure as follows + # test_dir -> file: file1.txt; dirs: subdir1_1, subdir1_2, subdir1_3 + # subdir1_1 -> file: file3.txt + # subdir1_2 -> dir: subdir2 + file_io.create_dir(dir_path) + file_io.write_string_to_file(os.path.join(dir_path, "file1.txt"), "testing") + sub_dirs1 = ["subdir1_1", "subdir1_2", "subdir1_3"] + for name in sub_dirs1: + file_io.create_dir(os.path.join(dir_path, name)) + file_io.write_string_to_file( + os.path.join(dir_path, "subdir1_1/file2.txt"), "testing") + file_io.create_dir(os.path.join(dir_path, "subdir1_2/subdir2")) + + def testWalkInOrder(self): + dir_path = os.path.join(self._base_dir, "test_dir") + self._setupWalkDirectories(dir_path) + # Now test the walk (in_order = True) + all_dirs = [] + all_subdirs = [] + all_files = [] + for (w_dir, w_subdirs, w_files) in file_io.walk(dir_path, in_order=True): + all_dirs.append(w_dir) + all_subdirs.append(w_subdirs) + all_files.append(w_files) + self.assertItemsEqual(all_dirs, [compat.as_bytes(dir_path)] + [ + compat.as_bytes(os.path.join(dir_path, item)) + for item in ["subdir1_1", "subdir1_2", 
"subdir1_2/subdir2", "subdir1_3"] + ]) + self.assertEqual(compat.as_bytes(dir_path), all_dirs[0]) + self.assertLess( + all_dirs.index(compat.as_bytes(os.path.join(dir_path, "subdir1_2"))), + all_dirs.index( + compat.as_bytes(os.path.join(dir_path, "subdir1_2/subdir2")))) + self.assertItemsEqual(all_subdirs[1:5], [[], [b"subdir2"], [], []]) + self.assertItemsEqual(all_subdirs[0], + [b"subdir1_1", b"subdir1_2", b"subdir1_3"]) + self.assertItemsEqual(all_files, [[b"file1.txt"], [b"file2.txt"], [], [], + []]) + self.assertLess( + all_files.index([b"file1.txt"]), all_files.index([b"file2.txt"])) + + def testWalkPostOrder(self): + dir_path = os.path.join(self._base_dir, "test_dir") + self._setupWalkDirectories(dir_path) + # Now test the walk (in_order = False) + all_dirs = [] + all_subdirs = [] + all_files = [] + for (w_dir, w_subdirs, w_files) in file_io.walk(dir_path, in_order=False): + all_dirs.append(w_dir) + all_subdirs.append(w_subdirs) + all_files.append(w_files) + self.assertItemsEqual(all_dirs, [ + compat.as_bytes(os.path.join(dir_path, item)) + for item in ["subdir1_1", "subdir1_2/subdir2", "subdir1_2", "subdir1_3"] + ] + [compat.as_bytes(dir_path)]) + self.assertEqual(compat.as_bytes(dir_path), all_dirs[4]) + self.assertLess( + all_dirs.index( + compat.as_bytes(os.path.join(dir_path, "subdir1_2/subdir2"))), + all_dirs.index(compat.as_bytes(os.path.join(dir_path, "subdir1_2")))) + self.assertItemsEqual(all_subdirs[0:4], [[], [], [b"subdir2"], []]) + self.assertItemsEqual(all_subdirs[4], + [b"subdir1_1", b"subdir1_2", b"subdir1_3"]) + self.assertItemsEqual(all_files, [[b"file2.txt"], [], [], [], + [b"file1.txt"]]) + self.assertLess( + all_files.index([b"file2.txt"]), all_files.index([b"file1.txt"])) + + def testWalkFailure(self): + dir_path = os.path.join(self._base_dir, "test_dir") + # Try walking a directory that wasn't created. 
+ all_dirs = [] + all_subdirs = [] + all_files = [] + for (w_dir, w_subdirs, w_files) in file_io.walk(dir_path, in_order=False): + all_dirs.append(w_dir) + all_subdirs.append(w_subdirs) + all_files.append(w_files) + self.assertItemsEqual(all_dirs, []) + self.assertItemsEqual(all_subdirs, []) + self.assertItemsEqual(all_files, []) + + def testStat(self): + file_path = os.path.join(self._base_dir, "temp_file") + file_io.write_string_to_file(file_path, "testing") + file_statistics = file_io.stat(file_path) + os_statistics = os.stat(file_path) + self.assertEquals(7, file_statistics.length) + self.assertEqual( + int(os_statistics.st_mtime), int(file_statistics.mtime_nsec / 1e9)) + if __name__ == "__main__": tf.test.main() diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py index b663c56e56a..0227f682343 100644 --- a/tensorflow/python/ops/array_grad.py +++ b/tensorflow/python/ops/array_grad.py @@ -151,7 +151,7 @@ def _SliceGrad(op, grad): @ops.RegisterGradient("StridedSlice") def _StridedSliceGrad(op, grad): - """Gradient for unpack op.""" + """Gradient for StridedSlice op.""" x = array_ops.shape(op.inputs[0]) begin = op.inputs[1] end = op.inputs[2] @@ -170,6 +170,25 @@ def _StridedSliceGrad(op, grad): shrink_axis_mask=op.get_attr("shrink_axis_mask")), None, None, None +@ops.RegisterGradient("StridedSliceGrad") +def _StridedSliceGradGrad(op, grad): + """Gradient for StridedSliceGrad op.""" + begin = op.inputs[1] + end = op.inputs[2] + strides = op.inputs[3] + + return None, None, None, None, array_ops.strided_slice( + grad, + begin, + end, + strides, + begin_mask=op.get_attr("begin_mask"), + end_mask=op.get_attr("end_mask"), + ellipsis_mask=op.get_attr("ellipsis_mask"), + new_axis_mask=op.get_attr("new_axis_mask"), + shrink_axis_mask=op.get_attr("shrink_axis_mask")) + + @ops.RegisterGradient("Split") def _SplitGrad(op, *grads): return None, array_ops.concat(op.inputs[0], list(grads)) @@ -255,6 +274,12 @@ def _GatherNdGrad(unused_op, 
unused_grad): raise NotImplementedError("Gradient for gather_nd is not implemented.") +@ops.RegisterGradient("CheckNumerics") +def _CheckNumericsGrad(_, grad): + """Gradient for check_numerics op.""" + return grad + + @ops.RegisterGradient("Identity") def _IdGrad(_, grad): return grad diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 1435279f549..9931de7bd6b 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -46,6 +46,7 @@ TensorFlow provides several operations to slice or extract parts of a tensor, or join multiple tensors together. @@slice +@@strided_slice @@split @@tile @@pad @@ -196,7 +197,7 @@ def zeros_initializer(shape, dtype=dtypes.float32): return zeros(shape, dtype) -def _NewSliceHelper(tensor, slice_spec): +def _SliceHelper(tensor, slice_spec): """Overload for Tensor.__getitem__. This operation extracts the specified region from the tensor. @@ -263,85 +264,24 @@ def _NewSliceHelper(tensor, slice_spec): shrink_axis_mask |= (1 << index) index += 1 - return strided_slice(tensor, - pack(begin), - pack(end), - pack(strides), - begin_mask=begin_mask, - end_mask=end_mask, - shrink_axis_mask=shrink_axis_mask, - new_axis_mask=new_axis_mask, - ellipsis_mask=ellipsis_mask) + # pack possibly involves often involves no tensors, so we must use op_scope + # correct graph + with ops.op_scope([tensor] + begin + end + strides, None, + "strided_slice") as name: + begin_pack, end_pack, strides_pack = pack(begin), pack(end), pack(strides) + return strided_slice(tensor, + begin_pack, + end_pack, + strides_pack, + begin_mask=begin_mask, + end_mask=end_mask, + shrink_axis_mask=shrink_axis_mask, + new_axis_mask=new_axis_mask, + ellipsis_mask=ellipsis_mask, + name=name) # pylint: disable=undefined-variable,protected-access -def _SliceHelper(tensor, slice_spec): - """Overload for Tensor.__getitem__. - - Currently the size of the slice must be statically known in each dimension, - i.e. 
the "stop" of the slice must not be omitted. - - TODO(mrry): Support slices where the sizes are not specified. - TODO(mrry): Support negative indices in slices with numpy/Python semantics. - - Args: - tensor: An ops.Tensor object. - slice_spec: The arguments to Tensor.__getitem__. - - Returns: - The appropriate slice of "tensor", based on "slice_spec". - - Raises: - ValueError: If a slice range is negative size. - TypeError: If the slice indices aren't int, slice, or Ellipsis. - """ - if not isinstance(slice_spec, (list, tuple)): - slice_spec = [slice_spec] - indices = [] - sizes = [] - squeeze_dims = [] - for dim, s in enumerate(slice_spec): - if isinstance(s, _baseslice): - if s.step not in (None, 1): - raise NotImplementedError( - "Steps other than 1 are not currently supported") - start = s.start if s.start is not None else 0 - if start < 0: - raise NotImplementedError( - "Negative start indices are not currently supported") - indices.append(start) - if s.stop is not None and s.stop < 0: - raise NotImplementedError( - "Negative stop indices are not currently supported") - # NOTE(mrry): If the stop is not specified, Python substitutes - # sys.maxsize, which is typically (2 ** 63) - 1. Since Slice currently - # supports signed DT_INT32 arguments, we use -1 to specify that all - # elements should be captured. 
- if s.stop is None or s.stop == sys.maxsize: - sizes.append(-1) - else: - if start > s.stop: - raise ValueError("Stop must be at least start") - sizes.append(s.stop - start) - elif s is Ellipsis: - raise NotImplementedError("Ellipsis is not currently supported") - else: - try: - s = int(s) - except TypeError: - raise TypeError("Bad slice index %s of type %s" % (s, type(s))) - if s < 0: - raise NotImplementedError("Negative indices are currently unsupported") - indices.append(s) - sizes.append(1) - squeeze_dims.append(dim) - sliced = slice(tensor, indices, sizes) - if squeeze_dims: - return squeeze(sliced, squeeze_dims=squeeze_dims) - else: - return sliced - - def slice(input_, begin, size, name=None): """Extracts a slice from a tensor. @@ -490,8 +430,6 @@ def strided_slice(input_, new_axis_mask=new_axis_mask, shrink_axis_mask=shrink_axis_mask) -# TODO(aselle): When gradient is added and performance verified switch -# ops.Tensor._override_operator("__getitem__", _NewSliceHelper) ops.Tensor._override_operator("__getitem__", _SliceHelper) @@ -1595,8 +1533,9 @@ def _StridedSliceShape(op): sparse_dims = begin_shape.merge_with(end_shape).merge_with(strides_shape)[ 0].value - if sparse_dims is None: - return [input_shape.unknown_shape()] + if (sparse_dims is None or begin_value is None or end_value is None or + strides_value is None): + return [tensor_shape.unknown_shape()] begin_mask = op.get_attr("begin_mask") end_mask = op.get_attr("end_mask") diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index 82300ce0d89..f28d19b4b0c 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -98,7 +98,7 @@ def assert_proper_iterable(values): 'Expected argument "values" to be iterable. Found: %s' % type(values)) -def assert_negative(x, data=None, summarize=None, name=None): +def assert_negative(x, data=None, summarize=None, message=None, name=None): """Assert the condition `x < 0` holds element-wise. 
Example of adding a dependency to an operation: @@ -122,20 +122,23 @@ def assert_negative(x, data=None, summarize=None, name=None): data: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. name: A name for this operation (optional). Defaults to "assert_negative". Returns: Op raising `InvalidArgumentError` unless `x` is all negative. """ + message = message or '' with ops.op_scope([x, data], name, 'assert_negative'): x = ops.convert_to_tensor(x, name='x') if data is None: - data = ['Condition x < 0 did not hold element-wise: x = ', x.name, x] + data = [ + message, 'Condition x < 0 did not hold element-wise: x = ', x.name, x] zero = ops.convert_to_tensor(0, dtype=x.dtype) return assert_less(x, zero, data=data, summarize=summarize) -def assert_positive(x, data=None, summarize=None, name=None): +def assert_positive(x, data=None, summarize=None, message=None, name=None): """Assert the condition `x > 0` holds element-wise. Example of adding a dependency to an operation: @@ -159,20 +162,23 @@ def assert_positive(x, data=None, summarize=None, name=None): data: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. name: A name for this operation (optional). Defaults to "assert_positive". Returns: Op raising `InvalidArgumentError` unless `x` is all positive. 
""" + message = message or '' with ops.op_scope([x, data], name, 'assert_positive'): x = ops.convert_to_tensor(x, name='x') if data is None: - data = ['Condition x > 0 did not hold element-wise: x = ', x.name, x] + data = [ + message, 'Condition x > 0 did not hold element-wise: x = ', x.name, x] zero = ops.convert_to_tensor(0, dtype=x.dtype) return assert_less(zero, x, data=data, summarize=summarize) -def assert_non_negative(x, data=None, summarize=None, name=None): +def assert_non_negative(x, data=None, summarize=None, message=None, name=None): """Assert the condition `x >= 0` holds element-wise. Example of adding a dependency to an operation: @@ -196,21 +202,25 @@ def assert_non_negative(x, data=None, summarize=None, name=None): data: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. name: A name for this operation (optional). Defaults to "assert_non_negative". Returns: Op raising `InvalidArgumentError` unless `x` is all non-negative. """ + message = message or '' with ops.op_scope([x, data], name, 'assert_non_negative'): x = ops.convert_to_tensor(x, name='x') if data is None: - data = ['Condition x >= 0 did not hold element-wise: x = ', x.name, x] + data = [ + message, + 'Condition x >= 0 did not hold element-wise: x = ', x.name, x] zero = ops.convert_to_tensor(0, dtype=x.dtype) return assert_less_equal(zero, x, data=data, summarize=summarize) -def assert_non_positive(x, data=None, summarize=None, name=None): +def assert_non_positive(x, data=None, summarize=None, message=None, name=None): """Assert the condition `x <= 0` holds element-wise. Example of adding a dependency to an operation: @@ -234,21 +244,25 @@ def assert_non_positive(x, data=None, summarize=None, name=None): data: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. 
summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. name: A name for this operation (optional). Defaults to "assert_non_positive". Returns: Op raising `InvalidArgumentError` unless `x` is all non-positive. """ + message = message or '' with ops.op_scope([x, data], name, 'assert_non_positive'): x = ops.convert_to_tensor(x, name='x') if data is None: - data = ['Condition x <= 0 did not hold element-wise: x = ', x.name, x] + data = [ + message, + 'Condition x <= 0 did not hold element-wise: x = ', x.name, x] zero = ops.convert_to_tensor(0, dtype=x.dtype) return assert_less_equal(x, zero, data=data, summarize=summarize) -def assert_equal(x, y, data=None, summarize=None, name=None): +def assert_equal(x, y, data=None, summarize=None, message=None, name=None): """Assert the condition `x == y` holds element-wise. Example of adding a dependency to an operation: @@ -274,16 +288,19 @@ def assert_equal(x, y, data=None, summarize=None, name=None): data: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`, `y`. summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. name: A name for this operation (optional). Defaults to "assert_equal". Returns: Op that raises `InvalidArgumentError` if `x == y` is False. """ + message = message or '' with ops.op_scope([x, y, data], name, 'assert_equal'): x = ops.convert_to_tensor(x, name='x') y = ops.convert_to_tensor(y, name='y') if data is None: data = [ + message, 'Condition x == y did not hold element-wise: x = ', x.name, x, 'y = ', y.name, y ] @@ -291,7 +308,7 @@ def assert_equal(x, y, data=None, summarize=None, name=None): return logging_ops.Assert(condition, data, summarize=summarize) -def assert_less(x, y, data=None, summarize=None, name=None): +def assert_less(x, y, data=None, summarize=None, message=None, name=None): """Assert the condition `x < y` holds element-wise. 
Example of adding a dependency to an operation: @@ -317,16 +334,19 @@ def assert_less(x, y, data=None, summarize=None, name=None): data: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`, `y`. summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. name: A name for this operation (optional). Defaults to "assert_less". Returns: Op that raises `InvalidArgumentError` if `x < y` is False. """ + message = message or '' with ops.op_scope([x, y, data], name, 'assert_less'): x = ops.convert_to_tensor(x, name='x') y = ops.convert_to_tensor(y, name='y') if data is None: data = [ + message, 'Condition x < y did not hold element-wise: x = ', x.name, x, 'y = ', y.name, y ] @@ -334,7 +354,7 @@ def assert_less(x, y, data=None, summarize=None, name=None): return logging_ops.Assert(condition, data, summarize=summarize) -def assert_less_equal(x, y, data=None, summarize=None, name=None): +def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None): """Assert the condition `x <= y` holds element-wise. Example of adding a dependency to an operation: @@ -360,16 +380,19 @@ def assert_less_equal(x, y, data=None, summarize=None, name=None): data: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`, `y`. summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. name: A name for this operation (optional). Defaults to "assert_less_equal" Returns: Op that raises `InvalidArgumentError` if `x <= y` is False. 
""" + message = message or '' with ops.op_scope([x, y, data], name, 'assert_less_equal'): x = ops.convert_to_tensor(x, name='x') y = ops.convert_to_tensor(y, name='y') if data is None: data = [ + message, 'Condition x <= y did not hold element-wise: x = ', x.name, x, 'y = ', y.name, y ] @@ -416,24 +439,23 @@ def _assert_rank_condition(x, rank, static_condition, dynamic_condition, data, if x_rank_static is not None: if not static_condition(x_rank_static, rank_static): - raise ValueError('Static rank condition failed', x_rank_static, - rank_static) + raise ValueError( + 'Static rank condition failed', x_rank_static, rank_static) return control_flow_ops.no_op(name='static_checks_determined_all_ok') condition = dynamic_condition(array_ops.rank(x), rank) + # Add the condition that `rank` must have rank zero. Prevents the bug where + # someone does assert_rank(x, [n]), rather than assert_rank(x, n). if rank_static is None: - rank_check = assert_rank(rank, - 0, - data=[['Rank must be a scalar.' - 'Received rank:'], rank]) - + this_data = ['Rank must be a scalar. Received rank: ', rank] + rank_check = assert_rank(rank, 0, data=this_data) condition = control_flow_ops.with_dependencies([rank_check], condition) return logging_ops.Assert(condition, data, summarize=summarize) -def assert_rank(x, rank, data=None, summarize=None, name=None): +def assert_rank(x, rank, data=None, summarize=None, message=None, name=None): """Assert `x` has rank equal to `rank`. Example of adding a dependency to an operation: @@ -455,20 +477,24 @@ def assert_rank(x, rank, data=None, summarize=None, name=None): data: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. name: A name for this operation (optional). Defaults to "assert_rank". Returns: Op raising `InvalidArgumentError` unless `x` has specified rank. 
+ If static checks determine `x` has correct rank, a `no_op` is returned. Raises: ValueError: If static checks determine `x` has wrong rank. """ + message = message or '' static_condition = lambda actual_rank, given_rank: actual_rank == given_rank dynamic_condition = math_ops.equal if data is None: data = [ + message, 'Tensor %s must have rank' % x.name, rank, 'Received shape: ', array_ops.shape(x) ] @@ -480,15 +506,16 @@ def assert_rank(x, rank, data=None, summarize=None, name=None): except ValueError as e: if e.args[0] == 'Static rank condition failed': raise ValueError( - 'Tensor %s must have rank %d. Received rank %d, shape %s' % - (x.name, e.args[2], e.args[1], x.get_shape())) + '%s. Tensor %s must have rank %d. Received rank %d, shape %s' % + (message, x.name, e.args[2], e.args[1], x.get_shape())) else: raise return assert_op -def assert_rank_at_least(x, rank, data=None, summarize=None, name=None): +def assert_rank_at_least( + x, rank, data=None, summarize=None, message=None, name=None): """Assert `x` has rank equal to `rank` or higher. Example of adding a dependency to an operation: @@ -510,20 +537,24 @@ def assert_rank_at_least(x, rank, data=None, summarize=None, name=None): data: The tensors to print out if the condition is False. Defaults to error message and first few entries of `x`. summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. name: A name for this operation (optional). Defaults to "assert_rank_at_least". Returns: Op raising `InvalidArgumentError` unless `x` has specified rank or higher. + If static checks determine `x` has correct rank, a `no_op` is returned. Raises: ValueError: If static checks determine `x` has wrong rank. 
""" + message = message or '' static_condition = lambda actual_rank, given_rank: actual_rank >= given_rank dynamic_condition = math_ops.greater_equal if data is None: data = [ + message, 'Tensor %s must have rank at least' % x.name, rank, 'Received shape: ', array_ops.shape(x) ] @@ -535,15 +566,15 @@ def assert_rank_at_least(x, rank, data=None, summarize=None, name=None): except ValueError as e: if e.args[0] == 'Static rank condition failed': raise ValueError( - 'Tensor %s must have rank at least %d. Received rank %d, shape %s' % - (x.name, e.args[2], e.args[1], x.get_shape())) + '%s. Tensor %s must have rank at least %d. Received rank %d, shape ' + '%s' % (message, x.name, e.args[2], e.args[1], x.get_shape())) else: raise return assert_op -def assert_integer(x, data=None, summarize=None, name=None): +def assert_integer(x, message=None, name=None): """Assert that `x` is of integer dtype. Example of adding a dependency to an operation: @@ -561,33 +592,50 @@ def assert_integer(x, data=None, summarize=None, name=None): Args: x: `Tensor` whose basetype is integer and is not quantized. - data: The tensors to print out if the condition is False. Defaults to - error message and first few entries of `x`. - summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. name: A name for this operation (optional). Defaults to "assert_integer". + Raises: + TypeError: If `x.dtype` is anything other than non-quantized integer. + Returns: - Op that raises `InvalidArgumentError` if `x == y` is False. + A `no_op` that does nothing. Type can be determined statically. """ + message = message or '' with ops.op_scope([x], name, 'assert_integer'): x = ops.convert_to_tensor(x, name='x') - data = ['x is not of integer dtype: x = ', x.name, x] - condition = x.dtype.is_integer - return logging_ops.Assert(condition, data, summarize=summarize) + if not x.dtype.is_integer: + err_msg = ( + '%s Expected "x" to be integer type. 
Found: %s of dtype %s' + % (message, x.name, x.dtype)) + raise TypeError(err_msg) + + return control_flow_ops.no_op('statically_determined_was_integer') -def assert_type(tensor, tf_type): - """Asserts that the given `Tensor` is of the specified type. +def assert_type(tensor, tf_type, message=None, name=None): + """Statically asserts that the given `Tensor` is of the specified type. Args: tensor: A tensorflow `Tensor`. tf_type: A tensorflow type (dtypes.float32, tf.int64, dtypes.bool, etc). + message: A string to prefix to the default message. + name: A name to give this `Op`. Defaults to "assert_type" Raises: - ValueError: If the tensors data type doesn't match tf_type. + TypeError: If the tensors data type doesn't match tf_type. + + Returns: + A `no_op` that does nothing. Type can be determined statically. """ - if tensor.dtype != tf_type: - raise ValueError('%s must be of type %s' % (tensor.op.name, tf_type)) + message = message or '' + with ops.op_scope([tensor], name, 'assert_type'): + tensor = ops.convert_to_tensor(tensor, name='tensor') + if tensor.dtype != tf_type: + raise TypeError( + '%s %s must be of type %s' % (message, tensor.op.name, tf_type)) + + return control_flow_ops.no_op('statically_determined_correct_type') def _get_diff_for_monotonic_comparison(x): diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 18a7a20c11d..ae3770416f3 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -118,6 +118,8 @@ def _Identity(data, name=None): else: return array_ops.identity(data, name=name) else: + if not isinstance(data, (ops.IndexedSlices, ops.SparseTensor)): + raise TypeError("Type %s not supported" % type(data)) values = _Identity(data.values, name=name) indices = array_ops.identity(data.indices, name="indices") if isinstance(data, ops.IndexedSlices): @@ -125,11 +127,9 @@ def _Identity(data, name=None): if dense_shape is not None: dense_shape = 
array_ops.identity(dense_shape, name="dense_shape") return ops.IndexedSlices(values, indices, dense_shape) - elif isinstance(data, ops.SparseTensor): + else: dense_shape = array_ops.identity(data.shape, name="dense_shape") return ops.SparseTensor(indices, values, dense_shape) - else: - raise TypeError("Type %s not supported" % type(data)) def _NextIteration(data, name=None): @@ -140,6 +140,8 @@ def _NextIteration(data, name=None): else: return next_iteration(data, name=name) else: + if not isinstance(data, (ops.IndexedSlices, ops.SparseTensor)): + raise TypeError("Type %s not supported" % type(data)) values = _NextIteration(data.values, name=name) indices = next_iteration(data.indices, name="indices") if isinstance(data, ops.IndexedSlices): @@ -147,11 +149,9 @@ def _NextIteration(data, name=None): if dense_shape is not None: dense_shape = next_iteration(dense_shape, name="dense_shape") return ops.IndexedSlices(values, indices, dense_shape) - elif isinstance(data, ops.SparseTensor): + else: dense_shape = next_iteration(data.shape, name="dense_shape") return ops.SparseTensor(indices, values, dense_shape) - else: - raise TypeError("Type %s not supported" % type(data)) def _Enter(data, frame_name, is_constant=False, parallel_iterations=10, @@ -183,6 +183,8 @@ def _Enter(data, frame_name, is_constant=False, parallel_iterations=10, return enter(data, frame_name, is_constant, parallel_iterations, name=name) else: + if not isinstance(data, (ops.IndexedSlices, ops.SparseTensor)): + raise TypeError("Type %s not supported" % type(data)) values = _Enter(data.values, frame_name, is_constant, parallel_iterations, name=name) indices = enter(data.indices, frame_name, is_constant, @@ -193,12 +195,10 @@ def _Enter(data, frame_name, is_constant=False, parallel_iterations=10, dense_shape = enter(dense_shape, frame_name, is_constant, parallel_iterations, name="dense_shape") return ops.IndexedSlices(values, indices, dense_shape) - elif isinstance(data, ops.SparseTensor): + else: 
dense_shape = enter(data.shape, frame_name, is_constant, parallel_iterations, name="dense_shape") return ops.SparseTensor(indices, values, dense_shape) - else: - raise TypeError("Type %s not supported" % type(data)) def exit(data, name=None): @@ -220,6 +220,8 @@ def exit(data, name=None): else: return gen_control_flow_ops._exit(data, name) else: + if not isinstance(data, (ops.IndexedSlices, ops.SparseTensor)): + raise TypeError("Type %s not supported" % type(data)) values = exit(data.values, name=name) indices = gen_control_flow_ops._exit(data.indices, name="indices") if isinstance(data, ops.IndexedSlices): @@ -227,11 +229,9 @@ def exit(data, name=None): if dense_shape is not None: dense_shape = gen_control_flow_ops._exit(dense_shape, name) return ops.IndexedSlices(values, indices, dense_shape) - elif isinstance(data, ops.SparseTensor): + else: dense_shape = gen_control_flow_ops._exit(data.shape, name) return ops.SparseTensor(indices, values, dense_shape) - else: - raise TypeError("Type %s not supported" % type(data)) def switch(data, pred, dtype=None, name=None): @@ -348,9 +348,11 @@ def merge(inputs, name=None): A tuple containing the chosen input tensor and its index in `inputs`. Raises: - ValueError: If inputs are IndexedSlices and some but not all have a - dense_shape property. + ValueError: If any of the inputs is None, or inputs are IndexedSlices and + some but not all have a dense_shape property. """ + if any([inp is None for inp in inputs]): + raise ValueError("At least one of the merge inputs is None: %s" % inputs) with ops.op_scope(inputs, name, "Merge") as name: inputs = [ops.convert_to_tensor_or_indexed_slices(inp, as_ref=True) for inp in inputs] @@ -1493,8 +1495,15 @@ class WhileContext(ControlFlowContext): self._AddOpInternal(op) def _AddOpInternal(self, op): - """Add `op` to the current context.""" + """Add `op` to the current context. 
+ + In the case that op has only external data inputs, we remove all of its + external control inputs so all its inputs are in the same while loop + context. This is valid because op now has an Enter input that has all + the right control dependency. + """ if not op.inputs: + # Remove any external control dependency on this op control_inputs = [x for x in op.control_inputs if x._get_control_flow_context() == self] if len(control_inputs) != len(op.control_inputs): @@ -1508,12 +1517,22 @@ class WhileContext(ControlFlowContext): for x in op.outputs: self._values.add(x.name) else: + has_internal_data_input = False for index in range(len(op.inputs)): x = op.inputs[index] self.AddValue(x) real_x = self._external_values.get(x.name) if real_x is not None: op._update_input(index, real_x) + else: + has_internal_data_input = True + if not has_internal_data_input: + # Remove any external control dependency on this op + control_inputs = [x for x in op.control_inputs + if x._get_control_flow_context() == self] + if len(control_inputs) != len(op.control_inputs): + del op.control_inputs[:] + op._add_control_inputs(control_inputs) # Add a control dependency to prevent loop invariants from # enabling ops that should not be executed. 
self._MaybeAddControlDependency(op) @@ -1879,6 +1898,8 @@ class WhileContext(ControlFlowContext): if isinstance(e, ops.Tensor): xs = [e] else: + if not isinstance(e, (ops.IndexedSlices, ops.SparseTensor)): + raise TypeError("Type %s not supported" % type(e)) xs = [e.values, e.indices] shape = e.dense_shape if isinstance(e, ops.IndexedSlices) else e.shape if shape is not None: diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py index 43630c2a726..c50fbcd25d0 100644 --- a/tensorflow/python/ops/functional_ops.py +++ b/tensorflow/python/ops/functional_ops.py @@ -209,7 +209,7 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, - swap_memory=False, name=None): + swap_memory=False, infer_shape=True, name=None): """map on the list of tensors unpacked from `elems` on dimension 0. The simplest version of `map` repeatedly applies the callable `fn` to a @@ -248,6 +248,7 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, in parallel. back_prop: (optional) True enables support for back propagation. swap_memory: (optional) True enables GPU-CPU memory swapping. + infer_shape: (optional) False disables tests for consistent output shapes. name: (optional) Name prefix for the returned tensors. Returns: @@ -335,7 +336,7 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, accs_ta = [ tensor_array_ops.TensorArray(dtype=dt, size=n, dynamic_size=False, - infer_shape=True) + infer_shape=infer_shape) for dt in dtype_flat] def compute(i, tas): @@ -380,7 +381,7 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True, def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, - swap_memory=False, name=None): + swap_memory=False, infer_shape=True, name=None): """scan on the list of tensors unpacked from `elems` on dimension 0. 
The simplest version of `scan` repeatedly applies the callable `fn` to a @@ -429,6 +430,7 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, in parallel. back_prop: (optional) True enables support for back propagation. swap_memory: (optional) True enables GPU-CPU memory swapping. + infer_shape: (optional) False disables tests for consistent output shapes. name: (optional) Name prefix for the returned tensors. Returns: @@ -523,7 +525,7 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, accs_ta = [ tensor_array_ops.TensorArray(dtype=init.dtype, size=n, dynamic_size=False, - infer_shape=True) + infer_shape=infer_shape) for init in a_flat] if initializer is None: diff --git a/tensorflow/python/ops/gradients.py b/tensorflow/python/ops/gradients.py index efd0826e566..27b7f044039 100644 --- a/tensorflow/python/ops/gradients.py +++ b/tensorflow/python/ops/gradients.py @@ -192,9 +192,6 @@ def _PendingCount(graph, to_ops, from_ops, colocate_gradients_with_ops): for x in op.inputs: if between_ops[x.op._id]: pending_count[x.op._id] += 1 - for x in op.control_inputs: - if between_ops[x._id]: - pending_count[x._id] += 1 return pending_count, loop_state @@ -361,6 +358,7 @@ def gradients(ys, grad_ys = [None] * len(ys) else: grad_ys = _AsList(grad_ys) + with ops.op_scope(ys + xs + grad_ys, name, "gradients"): ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y") xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x") @@ -512,10 +510,6 @@ def gradients(ys, control_flow_ops.IsLoopSwitch(x.op)) if ready: queue.append(x.op) - for x in op.control_inputs: - pending_count[x._id] -= 1 - if pending_count[x._id] is 0: - queue.append(x) # pylint: enable=protected-access if loop_state: diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py index f63cf812474..abed6b5777a 100644 --- a/tensorflow/python/ops/image_ops.py +++ b/tensorflow/python/ops/image_ops.py @@ -846,8 +846,7 @@ def 
per_image_whitening(image): stddev = math_ops.sqrt(variance) # Apply a minimum normalization that protects us against uniform images. - min_stddev = math_ops.inv( - math_ops.sqrt(math_ops.cast(num_pixels, dtypes.float32))) + min_stddev = math_ops.rsqrt(math_ops.cast(num_pixels, dtypes.float32)) pixel_value_scale = math_ops.maximum(stddev, min_stddev) pixel_value_offset = image_mean diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 0c8824d10a9..30e2b494b35 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import itertools import math import os @@ -34,6 +33,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import image_ops from tensorflow.python.ops import io_ops from tensorflow.python.platform import googletest +from tensorflow.python.platform import test class RGBToHSVTest(test_util.TensorFlowTestCase): @@ -68,11 +68,10 @@ class RGBToHSVTest(test_util.TensorFlowTestCase): data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] for nptype in [np.float32, np.float64]: rgb_np = np.array(data, dtype=nptype).reshape([2, 2, 3]) / 255. 
- for use_gpu in [True, False]: - with self.test_session(use_gpu=use_gpu): - hsv = image_ops.rgb_to_hsv(rgb_np) - rgb = image_ops.hsv_to_rgb(hsv) - rgb_tf = rgb.eval() + with self.test_session(): + hsv = image_ops.rgb_to_hsv(rgb_np) + rgb = image_ops.hsv_to_rgb(hsv) + rgb_tf = rgb.eval() self.assertAllClose(rgb_tf, rgb_np) @@ -234,65 +233,59 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): def testIdempotentLeftRight(self): x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1]) - for use_gpu in [False, True]: - with self.test_session(use_gpu=use_gpu): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf)) - y_tf = y.eval() - self.assertAllEqual(y_tf, x_np) + with self.test_session(): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf)) + y_tf = y.eval() + self.assertAllEqual(y_tf, x_np) def testLeftRight(self): x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1]) y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1]) - for use_gpu in [False, True]: - with self.test_session(use_gpu=use_gpu): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.flip_left_right(x_tf) - y_tf = y.eval() - self.assertAllEqual(y_tf, y_np) + with self.test_session(): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.flip_left_right(x_tf) + y_tf = y.eval() + self.assertAllEqual(y_tf, y_np) def testIdempotentUpDown(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) - for use_gpu in [False, True]: - with self.test_session(use_gpu=use_gpu): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf)) - y_tf = y.eval() - self.assertAllEqual(y_tf, x_np) + with self.test_session(): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = 
image_ops.flip_up_down(image_ops.flip_up_down(x_tf)) + y_tf = y.eval() + self.assertAllEqual(y_tf, x_np) def testUpDown(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1]) - for use_gpu in [False, True]: - with self.test_session(use_gpu=use_gpu): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.flip_up_down(x_tf) - y_tf = y.eval() - self.assertAllEqual(y_tf, y_np) + with self.test_session(): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.flip_up_down(x_tf) + y_tf = y.eval() + self.assertAllEqual(y_tf, y_np) def testIdempotentTranspose(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) - for use_gpu in [False, True]: - with self.test_session(use_gpu=use_gpu): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.transpose_image(image_ops.transpose_image(x_tf)) - y_tf = y.eval() - self.assertAllEqual(y_tf, x_np) + with self.test_session(): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.transpose_image(image_ops.transpose_image(x_tf)) + y_tf = y.eval() + self.assertAllEqual(y_tf, x_np) def testTranspose(self): x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1]) y_np = np.array([[1, 4], [2, 5], [3, 6]], dtype=np.uint8).reshape([3, 2, 1]) - for use_gpu in [False, True]: - with self.test_session(use_gpu=use_gpu): - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.transpose_image(x_tf) - y_tf = y.eval() - self.assertAllEqual(y_tf, y_np) + with self.test_session(): + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.transpose_image(x_tf) + y_tf = y.eval() + self.assertAllEqual(y_tf, y_np) def testPartialShapes(self): p_unknown_rank = array_ops.placeholder(dtypes.uint8) @@ -323,17 +316,16 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase): def testRot90GroupOrder(self): image = 
np.arange(24, dtype=np.uint8).reshape([2, 4, 3]) - for use_gpu in [False, True]: - with self.test_session(use_gpu=use_gpu): - rotated = image - for _ in xrange(4): - rotated = image_ops.rot90(rotated) - self.assertAllEqual(image, rotated.eval()) + with self.test_session(): + rotated = image + for _ in xrange(4): + rotated = image_ops.rot90(rotated) + self.assertAllEqual(image, rotated.eval()) def testRot90NumpyEquivalence(self): image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3]) - for use_gpu, k in itertools.product([False, True], range(4)): - with self.test_session(use_gpu=use_gpu): + for k in range(4): + with self.test_session(): y_np = np.rot90(image, k=k) y_tf = image_ops.rot90(image, k=k) self.assertAllEqual(y_np, y_tf.eval()) @@ -377,12 +369,11 @@ class RandomFlipTest(test_util.TensorFlowTestCase): class AdjustContrastTest(test_util.TensorFlowTestCase): def _testContrast(self, x_np, y_np, contrast_factor): - for use_gpu in [True, False]: - with self.test_session(use_gpu=use_gpu): - x = constant_op.constant(x_np, shape=x_np.shape) - y = image_ops.adjust_contrast(x, contrast_factor) - y_tf = y.eval() - self.assertAllClose(y_tf, y_np, 1e-6) + with self.test_session(): + x = constant_op.constant(x_np, shape=x_np.shape) + y = image_ops.adjust_contrast(x, contrast_factor) + y_tf = y.eval() + self.assertAllClose(y_tf, y_np, 1e-6) def testDoubleContrastUint8(self): x_shape = [1, 2, 2, 3] @@ -975,12 +966,12 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): TYPES = [np.uint8, np.int8, np.int16, np.int32, np.int64, np.float32, np.float64] - def availableGPUModes(self, opt, nptype): + def shouldRunOnGPU(self, opt, nptype): if opt == image_ops.ResizeMethod.NEAREST_NEIGHBOR \ and nptype in [np.float32, np.float64]: - return [True, False] + return True else: - return [False] + return False def testNoOp(self): img_shape = [1, 6, 4, 1] @@ -1000,8 +991,8 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): img_np = np.array(data, 
dtype=nptype).reshape(img_shape) for opt in self.OPTIONS: - for use_gpu in self.availableGPUModes(opt, nptype): - with self.test_session(use_gpu=use_gpu) as sess: + if test.is_gpu_available() and self.shouldRunOnGPU(opt, nptype): + with self.test_session() as sess: image = constant_op.constant(img_np, shape=img_shape) y = image_ops.resize_images(image, target_height, target_width, opt) yshape = array_ops.shape(y) @@ -1097,8 +1088,8 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): img_np = np.array(data, dtype=nptype).reshape(img_shape) for opt in self.OPTIONS: - for use_gpu in self.availableGPUModes(opt, nptype): - with self.test_session(use_gpu=use_gpu): + if test.is_gpu_available() and self.shouldRunOnGPU(opt, nptype): + with self.test_session(): image = constant_op.constant(img_np, shape=img_shape) y = image_ops.resize_images(image, target_height, target_width, opt) expected = np.array(expected_data).reshape(target_shape) @@ -1140,8 +1131,8 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): image_ops.ResizeMethod.BILINEAR, image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA]: - for use_gpu in self.availableGPUModes(opt, nptype): - with self.test_session(use_gpu=use_gpu): + if test.is_gpu_available() and self.shouldRunOnGPU(opt, nptype): + with self.test_session(): img_np = np.array(data, dtype=nptype).reshape(img_shape) image = constant_op.constant(img_np, shape=img_shape) y = image_ops.resize_images(image, target_height, target_width, opt) @@ -1207,25 +1198,29 @@ class ResizeImagesTest(test_util.TensorFlowTestCase): def testCompareNearestNeighbor(self): - input_shape = [1, 5, 6, 3] - target_height = 8 - target_width = 12 - for nptype in [np.float32, np.float64]: - for align_corners in [True, False]: - img_np = np.arange(0, np.prod(input_shape), dtype=nptype).reshape(input_shape) - with self.test_session(use_gpu=True): - image = constant_op.constant(img_np, shape=input_shape) - out_op = image_ops.resize_images(image, target_height, 
target_width, - image_ops.ResizeMethod.NEAREST_NEIGHBOR, - align_corners=align_corners) - gpu_val = out_op.eval() - with self.test_session(use_gpu=False): - image = constant_op.constant(img_np, shape=input_shape) - out_op = image_ops.resize_images(image, target_height, target_width, - image_ops.ResizeMethod.NEAREST_NEIGHBOR, - align_corners=align_corners) - cpu_val = out_op.eval() - self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5) + if test.is_gpu_available(): + input_shape = [1, 5, 6, 3] + target_height = 8 + target_width = 12 + for nptype in [np.float32, np.float64]: + for align_corners in [True, False]: + img_np = np.arange( + 0, np.prod(input_shape), dtype=nptype).reshape(input_shape) + with self.test_session(use_gpu=True): + image = constant_op.constant(img_np, shape=input_shape) + out_op = image_ops.resize_images( + image, target_height, target_width, + image_ops.ResizeMethod.NEAREST_NEIGHBOR, + align_corners=align_corners) + gpu_val = out_op.eval() + with self.test_session(use_gpu=False): + image = constant_op.constant(img_np, shape=input_shape) + out_op = image_ops.resize_images( + image, target_height, target_width, + image_ops.ResizeMethod.NEAREST_NEIGHBOR, + align_corners=align_corners) + cpu_val = out_op.eval() + self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5) class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py index 67fadc12cdc..7c102390432 100644 --- a/tensorflow/python/ops/linalg_grad.py +++ b/tensorflow/python/ops/linalg_grad.py @@ -32,6 +32,12 @@ from tensorflow.python.ops import math_ops ops.NoGradient("CholeskyGrad") ops.NoGradient("BatchCholeskyGrad") +ops.NoGradient("SelfAdjointEig") +ops.NoGradient("BatchSelfAdjointEig") +ops.NoGradient("SelfAdjointEigV2") +ops.NoGradient("BatchSelfAdjointEigV2") +ops.NoGradient("Svd") +ops.NoGradient("BatchSvd") @ops.RegisterGradient("MatrixInverse") diff --git 
a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py index 0e76f772caf..9d11cbfc873 100644 --- a/tensorflow/python/ops/linalg_ops.py +++ b/tensorflow/python/ops/linalg_ops.py @@ -27,27 +27,32 @@ from tensorflow.python.ops.gen_linalg_ops import * # pylint: enable=wildcard-import +def _UnchangedSquareHelper(input_shape): + """Helper for {Batch}UnchangedSquare.""" + # The matrices in the batch must be square. + input_shape[-1].assert_is_compatible_with(input_shape[-2]) + return [input_shape] + + @ops.RegisterShape("Cholesky") @ops.RegisterShape("CholeskyGrad") @ops.RegisterShape("MatrixInverse") def _UnchangedSquare(op): - input_shape = op.inputs[0].get_shape().with_rank(2) - # The matrix must be square. - input_shape[0].assert_is_compatible_with(input_shape[1]) - return [input_shape] + """Shape function for matrix ops with output equal to input shape.""" + return _UnchangedSquareHelper(op.inputs[0].get_shape().with_rank(2)) @ops.RegisterShape("BatchCholesky") @ops.RegisterShape("BatchCholeskyGrad") @ops.RegisterShape("BatchMatrixInverse") def _BatchUnchangedSquare(op): - input_shape = op.inputs[0].get_shape().with_rank_at_least(2) - # The matrices in the batch must be square. - input_shape[-1].assert_is_compatible_with(input_shape[-2]) - return [input_shape] + """Shape function for batch matrix ops with output equal to input shape.""" + return _UnchangedSquareHelper(op.inputs[0].get_shape().with_rank_at_least(2)) + @ops.RegisterShape("MatrixDeterminant") def _MatrixDeterminantShape(op): + """Shape function for determinant op.""" input_shape = op.inputs[0].get_shape().with_rank(2) # The matrix must be square. 
input_shape[0].assert_is_compatible_with(input_shape[1]) @@ -59,6 +64,7 @@ def _MatrixDeterminantShape(op): @ops.RegisterShape("BatchMatrixDeterminant") def _BatchMatrixDeterminantShape(op): + """Shape function for batch determinant op.""" input_shape = op.inputs[0].get_shape().with_rank_at_least(2) # The matrices in the batch must be square. input_shape[-1].assert_is_compatible_with(input_shape[-2]) @@ -70,6 +76,7 @@ def _BatchMatrixDeterminantShape(op): @ops.RegisterShape("SelfAdjointEig") def _SelfAdjointEigShape(op): + """Shape function for self-adjoint eigensolver op.""" input_shape = op.inputs[0].get_shape().with_rank(2) # The matrix must be square. input_shape[0].assert_is_compatible_with(input_shape[1]) @@ -80,6 +87,7 @@ def _SelfAdjointEigShape(op): @ops.RegisterShape("BatchSelfAdjointEig") def _BatchSelfAdjointEigShape(op): + """Shape function for batch self-adjoint eigensolver op.""" input_shape = op.inputs[0].get_shape().with_rank_at_least(2) # The matrices in the batch must be square. 
input_shape[-1].assert_is_compatible_with(input_shape[-2]) @@ -89,48 +97,113 @@ return [out_shape] +def _SelfAdjointEigV2ShapeHelper(op, input_shape): + """Shape inference helper for {Batch}SelfAdjointEigV2.""" + batch_shape = input_shape[:-2] + n = input_shape[-1].merge_with(input_shape[-2]) + compute_v = op.get_attr("compute_v") + if compute_v: + return [batch_shape.concatenate([n]), batch_shape.concatenate([n, n])] + else: + return [batch_shape.concatenate([n]), [0]] + + +@ops.RegisterShape("SelfAdjointEigV2") +def _SelfAdjointEigShapeV2(op): + """Shape function for SelfAdjointEigV2.""" + return _SelfAdjointEigV2ShapeHelper(op, op.inputs[0].get_shape().with_rank(2)) + + +@ops.RegisterShape("BatchSelfAdjointEigV2") +def _BatchSelfAdjointEigV2Shape(op): + """Shape function for BatchSelfAdjointEigV2.""" + return _SelfAdjointEigV2ShapeHelper( + op, op.inputs[0].get_shape().with_rank_at_least(2)) + + +def _SvdShapeHelper(input_shape, op): + """Shape inference helper for {Batch}SVD op.""" + unknown = tensor_shape.unknown_shape() + if input_shape.ndims is None: + return [unknown, unknown, unknown] + compute_uv = op.get_attr("compute_uv") + full_matrices = op.get_attr("full_matrices") + m = input_shape[-2] + n = input_shape[-1] + p = min(m, n) + batch_shape = input_shape[:-2] + s_shape = batch_shape.concatenate([p]) + if compute_uv: + if full_matrices: + u_shape = batch_shape.concatenate([m, m]) + v_shape = batch_shape.concatenate([n, n]) + else: + u_shape = batch_shape.concatenate([m, p]) + v_shape = batch_shape.concatenate([n, p]) + else: + u_shape = [0] + v_shape = [0] + return [s_shape, u_shape, v_shape] + + +@ops.RegisterShape("Svd") +def _SvdShape(op): + """Shape function for SVD op.""" + return _SvdShapeHelper(op.inputs[0].get_shape().with_rank(2), op) + + +@ops.RegisterShape("BatchSvd") +def _BatchSvdShape(op): + """Shape function for batch SVD op.""" + return _SvdShapeHelper(op.inputs[0].get_shape().with_rank_at_least(2),
op) + + +def _SquareMatrixSolveShapeHelper(lhs_shape, rhs_shape): + """Shape inference helper function for square matrix solver ops.""" + # The matrix must be square. + lhs_shape[-1].assert_is_compatible_with(lhs_shape[-2]) + # The matrix and right-hand side must have the same number of rows. + lhs_shape[-2].assert_is_compatible_with(rhs_shape[-2]) + return [rhs_shape] + + @ops.RegisterShape("MatrixSolve") @ops.RegisterShape("MatrixTriangularSolve") def _SquareMatrixSolveShape(op): - lhs_shape = op.inputs[0].get_shape().with_rank(2) - rhs_shape = op.inputs[1].get_shape().with_rank(2) - # The matrix must be square. - lhs_shape[0].assert_is_compatible_with(lhs_shape[1]) - # The matrix and right-hand side must have the same number of rows. - lhs_shape[0].assert_is_compatible_with(rhs_shape[0]) - return [rhs_shape] + """Shape function for square matrix solver ops.""" + return _SquareMatrixSolveShapeHelper(op.inputs[0].get_shape().with_rank(2), + op.inputs[1].get_shape().with_rank(2)) @ops.RegisterShape("BatchMatrixSolve") @ops.RegisterShape("BatchMatrixTriangularSolve") def _BatchSquareMatrixSolveShape(op): - lhs_shape = op.inputs[0].get_shape().with_rank_at_least(2) - rhs_shape = op.inputs[1].get_shape().with_rank_at_least(2) - # The matrices must be square. - lhs_shape[-1].assert_is_compatible_with(lhs_shape[-2]) - # The matrices and right-hand sides in the batch must have the same number of - # rows. + """Shape function for batch square matrix solver ops.""" + return _SquareMatrixSolveShapeHelper( + op.inputs[0].get_shape().with_rank_at_least(2), + op.inputs[1].get_shape().with_rank_at_least(2)) + + +def _MatrixSolveLsShapeHelper(lhs_shape, rhs_shape): + """Shape inference helper function for least squares matrix solver ops.""" + # The matrices and right-hand sides must have the same number of rows. 
lhs_shape[-2].assert_is_compatible_with(rhs_shape[-2]) - return [rhs_shape] + return [lhs_shape[:-2].concatenate([lhs_shape[-1], rhs_shape[-1]])] @ops.RegisterShape("MatrixSolveLs") def _MatrixSolveLsShape(op): - lhs_shape = op.inputs[0].get_shape().with_rank(2) - rhs_shape = op.inputs[1].get_shape().with_rank(2) - # The matrix and right-hand side must have the same number of rows. - lhs_shape[0].assert_is_compatible_with(rhs_shape[0]) - return [[lhs_shape[1], rhs_shape[1]]] + """Shape function for least-squares matrix solver op.""" + return _MatrixSolveLsShapeHelper(op.inputs[0].get_shape().with_rank(2), + op.inputs[1].get_shape().with_rank(2)) @ops.RegisterShape("BatchMatrixSolveLs") def _BatchMatrixSolveLsShape(op): - lhs_shape = op.inputs[0].get_shape().with_rank_at_least(2) - rhs_shape = op.inputs[1].get_shape().with_rank_at_least(2) - # The matrices and right-hand sides in the batch must have the same number of - # rows. - lhs_shape[-2].assert_is_compatible_with(rhs_shape[-2]) - return [lhs_shape[:-2].concatenate([lhs_shape[-1], rhs_shape[-1]])] + """Shape function for batch least-squares matrix solver op.""" + return _MatrixSolveLsShapeHelper( + op.inputs[0].get_shape().with_rank_at_least(2), + op.inputs[1].get_shape().with_rank_at_least(2)) # Names below are lower_case. @@ -331,4 +404,163 @@ def batch_matrix_solve_ls(matrix, fast=fast, name=name) + +def self_adjoint_eig(matrix, name=None): + """Computes the eigen decomposition of a self-adjoint matrix. + + Computes the eigenvalues and eigenvectors of an N-by-N matrix `matrix` such + that `matrix * v[:,i] = e(i) * v[:,i]`, for i=0...N-1. + + Args: + matrix: `Tensor` of shape `[N, N]`. + name: string, optional name of the operation. + + Returns: + e: Eigenvalues. Shape is `[N]`. + v: Eigenvectors. Shape is `[N, N]`. The columns contain the eigenvectors of + `matrix`. 
+ """ + e, v = gen_linalg_ops.self_adjoint_eig_v2(matrix, compute_v=True, name=name) + return e, v + + +def batch_self_adjoint_eig(tensor, name=None): + """Computes the eigen decomposition of a batch of self-adjoint matrices. + + Computes the eigenvalues and eigenvectors of the innermost N-by-N matrices + in `tensor` such that + `tensor[...,:,:] * v[..., :,i] = e(..., i) * v[...,:,i]`, for i=0...N-1. + + Args: + tensor: `Tensor` of shape `[..., N, N]`. + name: string, optional name of the operation. + + Returns: + e: Eigenvalues. Shape is `[..., N]`. + v: Eigenvectors. Shape is `[..., N, N]`. The columns of the inner most + matrices + contain eigenvectors of the corresponding matrices in `tensor` + """ + e, v = gen_linalg_ops.batch_self_adjoint_eig_v2( + tensor, compute_v=True, name=name) + return e, v + + +def self_adjoint_eigvals(matrix, name=None): + """Computes the eigenvalues a self-adjoint matrix. + + Args: + matrix: `Tensor` of shape `[N, N]`. + name: string, optional name of the operation. + + Returns: + e: Eigenvalues of `matrix`. Shape is `[N]`. + """ + e, _ = gen_linalg_ops.self_adjoint_eig_v2(matrix, compute_v=False, name=name) + return e + + +def batch_self_adjoint_eigvals(tensor, name=None): + """Computes the eigenvalues of a batch of self-adjoint matrices. + + Args: + tensor: `Tensor` of shape `[..., N, N]`. + name: string, optional name of the operation. + + Returns: + e: Eigenvalues. Shape is `[..., N]`. The vector `e[..., :]` contains the `N` + eigenvalues of `tensor[..., :, :]`. + """ + e, _ = gen_linalg_ops.batch_self_adjoint_eig_v2( + tensor, compute_v=False, name=name) + return e + + +def svd(matrix, compute_uv=True, full_matrices=False, name=None): + """Computes the singular value decomposition of a matrix. + + Computes the SVD of `matrix` such that `matrix = u * diag(s) * + transpose(v)` + + ```prettyprint + # a is a matrix. + # s is a vector of singular values. + # u is the matrix of left singular vectors. 
+ # v is a matrix of right singular vectors. + s, u, v = svd(a) + s = svd(a, compute_uv=False) + ``` + + Args: + matrix: `Tensor` of shape `[M, N]`. Let `P` be the minimum of `M` and `N`. + compute_uv: If `True` then left and right singular vectors will be + computed and returned in `u` and `v`, respectively. Otherwise, only the + singular values will be computed, which can be significantly faster. + full_matrices: If true, compute full-sized `u` and `v`. If false + (the default), compute only the leading `P` singular vectors. + Ignored if `compute_uv` is `False`. + name: string, optional name of the operation. + + Returns: + s: Singular values. Shape is `[P]`. + u: Left singular vectors. If `full_matrices` is `False` (default) then + shape is `[M, P]`; if `full_matrices` is `True` then shape is + `[M, M]`. Not returned if `compute_uv` is `False`. + v: Right singular vectors. If `full_matrices` is `False` (default) then + shape is `[N, P]`. If `full_matrices` is `True` then shape is + `[N, N]`. Not returned if `compute_uv` is `False`. + """ + s, u, v = gen_linalg_ops.svd(matrix, + compute_uv=compute_uv, + full_matrices=full_matrices) + if compute_uv: + return s, u, v + else: + return s + + +def batch_svd(tensor, compute_uv=True, full_matrices=False, name=None): + """Computes the singular value decompositions of a batch of matrices. + + Computes the SVD of each inner matrix in `tensor` such that + `tensor[..., :, :] = u[..., :, :] * diag(s[..., :]) * transpose(v[..., :, + :])` + + ```prettyprint + # a is a tensor. + # s is a tensor of singular values. + # u is a tensor of left singular vectors. + # v is a tensor of right singular vectors. + s, u, v = batch_svd(a) + s = batch_svd(a, compute_uv=False) + ``` + + Args: + tensor: `Tensor` of shape `[..., M, N]`. Let `P` be the minimum of `M` and + `N`. + compute_uv: If `True` then left and right singular vectors will be + computed and returned in `u` and `v`, respectively.
Otherwise, only the + singular values will be computed, which can be significantly faster. + full_matrices: If true, compute full-sized `u` and `v`. If false + (the default), compute only the leading `P` singular vectors. + Ignored if `compute_uv` is `False`. + name: string, optional name of the operation. + + Returns: + s: Singular values. Shape is `[..., P]`. + u: Left singular vectors. If `full_matrices` is `False` (default) then + shape is `[..., M, P]`; if `full_matrices` is `True` then shape is + `[..., M, M]`. Not returned if `compute_uv` is `False`. + v: Right singular vectors. If `full_matrices` is `False` (default) then + shape is `[..., N, P]`. If `full_matrices` is `True` then shape is + `[..., N, N]`. Not returned if `compute_uv` is `False`. + """ + s, u, v = gen_linalg_ops.batch_svd( + tensor, compute_uv=compute_uv, full_matrices=full_matrices) + if compute_uv: + return s, u, v + else: + return s + + # pylint: enable=invalid-name diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py index 0620a3da2c4..2331f21d479 100644 --- a/tensorflow/python/ops/math_grad.py +++ b/tensorflow/python/ops/math_grad.py @@ -161,7 +161,7 @@ def _SegmentMeanGrad(op, grad): array_ops.fill(array_ops.expand_dims(input_rank - 1, 0), 1)]) ones = array_ops.fill(ones_shape, constant_op.constant(1, dtype=grad.dtype)) - scaled_grad = grad * math_ops.inv(math_ops.segment_sum(ones, op.inputs[1])) + scaled_grad = math_ops.div(grad, math_ops.segment_sum(ones, op.inputs[1])) return array_ops.gather(scaled_grad, op.inputs[1]), None diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index fc7b299978a..981218bd8ba 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -98,9 +98,6 @@ functions on matrices to your graph.
@@cholesky_solve @@batch_cholesky_solve -@@self_adjoint_eig -@@batch_self_adjoint_eig - @@matrix_solve @@batch_matrix_solve @@ -110,6 +107,14 @@ functions on matrices to your graph. @@matrix_solve_ls @@batch_matrix_solve_ls +@@self_adjoint_eig +@@batch_self_adjoint_eig +@@self_adjoint_eigvals +@@batch_self_adjoint_eigvals + +@@svd +@@batch_svd + ## Complex Number Functions TensorFlow provides several operations that you can use to add complex number @@ -422,7 +427,8 @@ def scalar_mul(scalar, x): Raises: ValueError: if scalar is not a 0-D `scalar`. """ - scalar = ops.convert_to_tensor(scalar, dtype=x.dtype, name="scalar") + scalar = ops.convert_to_tensor(scalar, dtype=x.dtype.base_dtype, + name="scalar") shape = scalar.get_shape() if shape.ndims == 0: if isinstance(x, ops.IndexedSlices): @@ -1597,91 +1603,93 @@ def tanh(x, name=None): def cumsum(x, axis=0, exclusive=False, reverse=False, name=None): - """Compute the cumulative sum of the tensor `x` along `axis`. + """Compute the cumulative sum of the tensor `x` along `axis`. 
- By default, this op performs an inclusive cumsum, which means that the first - element of the input is identical to the first element of the output: - ```prettyprint - tf.cumsum([a, b, c]) ==> [a, a + b, a + b + c] - ``` + By default, this op performs an inclusive cumsum, which means that the first + element of the input is identical to the first element of the output: + ```prettyprint + tf.cumsum([a, b, c]) ==> [a, a + b, a + b + c] + ``` - By setting the `exclusive` kwarg to `True`, an exclusive cumsum is performed - instead: - ```prettyprint - tf.cumsum([a, b, c], exclusive=True) ==> [0, a, a + b] - ``` + By setting the `exclusive` kwarg to `True`, an exclusive cumsum is performed + instead: + ```prettyprint + tf.cumsum([a, b, c], exclusive=True) ==> [0, a, a + b] + ``` - By setting the `reverse` kwarg to `True`, the cumsum is performed in the - opposite direction: - ```prettyprint - tf.cumsum([a, b, c], reverse=True) ==> [a + b + c, b + c, c] - ``` - This is more efficient than using separate `tf.reverse` ops. + By setting the `reverse` kwarg to `True`, the cumsum is performed in the + opposite direction: + ```prettyprint + tf.cumsum([a, b, c], reverse=True) ==> [a + b + c, b + c, c] + ``` + This is more efficient than using separate `tf.reverse` ops. - The `reverse` and `exclusive` kwargs can also be combined: - ```prettyprint - tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0] - ``` + The `reverse` and `exclusive` kwargs can also be combined: + ```prettyprint + tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0] + ``` - Args: - x: A `Tensor`. Must be one of the following types: `float32`, `float64`, + Args: + x: A `Tensor`. Must be one of the following types: `float32`, `float64`, `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, `complex128`, `qint8`, `quint8`, `qint32`, `half`. - axis: A `Tensor` of type `int32` (default: 0). - reverse: A `bool` (default: False). 
- name: A name for the operation (optional). + axis: A `Tensor` of type `int32` (default: 0). + reverse: A `bool` (default: False). + name: A name for the operation (optional). - Returns: - A `Tensor`. Has the same type as `x`. - """ - with ops.op_scope([x], name, "Cumsum") as name: - x = ops.convert_to_tensor(x, name="x") - return gen_math_ops.cumsum(x, axis, exclusive=exclusive, - reverse=reverse, name=name) + Returns: + A `Tensor`. Has the same type as `x`. + """ + with ops.op_scope([x], name, "Cumsum") as name: + x = ops.convert_to_tensor(x, name="x") + return gen_math_ops.cumsum( + x, axis, exclusive=exclusive, reverse=reverse, name=name) def cumprod(x, axis=0, exclusive=False, reverse=False, name=None): - """Compute the cumulative product of the tensor `x` along `axis`. + """Compute the cumulative product of the tensor `x` along `axis`. - By default, this op performs an inclusive cumprod, which means that the first - element of the input is identical to the first element of the output: - ```prettyprint - tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c] - ``` + By default, this op performs an inclusive cumprod, which means that the + first + element of the input is identical to the first element of the output: + ```prettyprint + tf.cumprod([a, b, c]) ==> [a, a * b, a * b * c] + ``` - By setting the `exclusive` kwarg to `True`, an exclusive cumprod is performed - instead: - ```prettyprint - tf.cumprod([a, b, c], exclusive=True) ==> [0, a, a * b] - ``` + By setting the `exclusive` kwarg to `True`, an exclusive cumprod is + performed + instead: + ```prettyprint + tf.cumprod([a, b, c], exclusive=True) ==> [0, a, a * b] + ``` - By setting the `reverse` kwarg to `True`, the cumprod is performed in the - opposite direction: - ```prettyprint - tf.cumprod([a, b, c], reverse=True) ==> [a * b * c, b * c, c] - ``` - This is more efficient than using separate `tf.reverse` ops. 
+ By setting the `reverse` kwarg to `True`, the cumprod is performed in the + opposite direction: + ```prettyprint + tf.cumprod([a, b, c], reverse=True) ==> [a * b * c, b * c, c] + ``` + This is more efficient than using separate `tf.reverse` ops. - The `reverse` and `exclusive` kwargs can also be combined: - ```prettyprint - tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 0] - ``` + The `reverse` and `exclusive` kwargs can also be combined: + ```prettyprint + tf.cumprod([a, b, c], exclusive=True, reverse=True) ==> [b * c, c, 0] + ``` - Args: - x: A `Tensor`. Must be one of the following types: `float32`, `float64`, + Args: + x: A `Tensor`. Must be one of the following types: `float32`, `float64`, `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, `complex128`, `qint8`, `quint8`, `qint32`, `half`. - axis: A `Tensor` of type `int32` (default: 0). - reverse: A `bool` (default: False). - name: A name for the operation (optional). + axis: A `Tensor` of type `int32` (default: 0). + reverse: A `bool` (default: False). + name: A name for the operation (optional). - Returns: - A `Tensor`. Has the same type as `x`. - """ - with ops.op_scope([x], name, "Cumprod") as name: - x = ops.convert_to_tensor(x, name="x") - return gen_math_ops.cumprod(x, axis, exclusive=exclusive, - reverse=reverse, name=name) + Returns: + A `Tensor`. Has the same type as `x`. 
+ """ + with ops.op_scope([x], name, "Cumprod") as name: + x = ops.convert_to_tensor(x, name="x") + return gen_math_ops.cumprod( + x, axis, exclusive=exclusive, reverse=reverse, name=name) ops.RegisterShape("Abs")(common_shapes.unchanged_shape) diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index fdcae795dcc..a793f4968cc 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -21,8 +21,11 @@ from __future__ import print_function import numpy as np from tensorflow.python.framework import constant_op +from tensorflow.python.framework import ops from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables from tensorflow.python.platform import googletest exp = np.exp @@ -92,5 +95,36 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase): z_tf = math_ops.squared_difference(x, y).eval() self.assertAllClose(z, z_tf) +class ScalarMulTest(test_util.TensorFlowTestCase): + def testAcceptsRefs(self): + var = variables.Variable(10) + result = math_ops.scalar_mul(3, var) + init = variables.initialize_all_variables() + with self.test_session() as sess: + sess.run(init) + self.assertEqual(30, result.eval()) + + def testAcceptsConstant(self): + const = constant_op.constant(10) + result = math_ops.scalar_mul(3, const) + with self.test_session(): + self.assertEqual(30, result.eval()) + + def testAcceptsTensor(self): + tensor = array_ops.ones([10, 10]) + result = math_ops.scalar_mul(3, tensor) + expected = array_ops.ones([10, 10]) * 3 + + with self.test_session(): + self.assertAllEqual(expected.eval(), result.eval()) + + def testAcceptsIndexedSlices(self): + values = constant_op.constant([2, 3, 5, 7, 0, -1], shape=[3, 2]) + indices = constant_op.constant([0, 2, 5]) + x = math_ops.scalar_mul(-3, ops.IndexedSlices(values, indices)) + with self.test_session(): + 
self.assertAllEqual(x.values.eval(), [[-6, -9], [-15, -21], [0, 3]]) + self.assertAllEqual(x.indices.eval(), [0, 2, 5]) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py index 064666a27fa..9ed801dcc5a 100644 --- a/tensorflow/python/ops/nn.py +++ b/tensorflow/python/ops/nn.py @@ -189,6 +189,7 @@ These can be used for measuring accuracy of a network in a regression task or for regularization purposes (weight decay). @@l2_loss +@@log_poisson_loss ## Classification @@ -273,6 +274,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import math from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.python.framework import constant_op @@ -305,9 +307,74 @@ from tensorflow.python.ops.nn_ops import * from tensorflow.python.ops.candidate_sampling_ops import * from tensorflow.python.ops.embedding_ops import * from tensorflow.python.ops.rnn import * + # pylint: enable=wildcard-import +def log_poisson_loss(log_input, targets, compute_full_loss=False, name=None): + """Computes log poisson loss given `log_input`. + + Gives the log-likelihood loss between the prediction and the target under the + assumption that the target has a poisson distribution. + Caveat: By default, this is not the exact loss, but the loss minus a + constant term [log(z!)]. That has no effect for optimization, but + does not play well with relative loss comparisons. To compute an + approximation of the log factorial term, specify + compute_full_loss=True to enable Stirling's Approximation. + + For brevity, let `c = log(x) = log_input`, `z = targets`. The log poisson + loss is + + -log(exp(-x) * (x^z) / z!) + = -log(exp(-x) * (x^z)) + log(z!) + ~ -log(exp(-x)) - log(x^z) [+ z * log(z) - z + 0.5 * log(2 * pi * z)] + [ Note the second term is the Stirling's Approximation for log(z!). 
+ It is invariant to x and does not affect optimization, though + important for correct relative loss comparisons. It is only + computed when compute_full_loss == True. ] + = x - z * log(x) [+ z * log(z) - z + 0.5 * log(2 * pi * z)] + = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)] + + Args: + log_input: A `Tensor` of type `float32` or `float64`. + targets: A `Tensor` of the same type and shape as `log_input`. + compute_full_loss: whether to compute the full loss. If false, a constant + term is dropped in favor of more efficient optimization. + name: A name for the operation (optional). + + Returns: + A `Tensor` of the same shape as `log_input` with the componentwise + logistic losses. + + Raises: + ValueError: If `log_input` and `targets` do not have the same shape. + """ + with ops.op_scope([log_input, targets], name, "log_poisson_loss") as name: + log_input = ops.convert_to_tensor(log_input, name="log_input") + targets = ops.convert_to_tensor(targets, name="targets") + try: + targets.get_shape().merge_with(log_input.get_shape()) + except ValueError: + raise ValueError( + "log_input and targets must have the same shape (%s vs %s)" % + (log_input.get_shape(), targets.get_shape())) + + result = math_ops.exp(log_input) - log_input * targets + if compute_full_loss: + # need to create constant tensors here so that their dtypes can be matched + # to that of the targets. 
+ point_five = constant_op.constant(0.5, dtype=targets.dtype) + two_pi = constant_op.constant(2 * math.pi, dtype=targets.dtype) + + stirling_approx = (targets * math_ops.log(targets)) - targets + ( + point_five * math_ops.log(two_pi * targets)) + zeros = array_ops.zeros_like(targets, dtype=targets.dtype) + ones = array_ops.ones_like(targets, dtype=targets.dtype) + cond = math_ops.logical_and(targets >= zeros, targets <= ones) + result += math_ops.select(cond, zeros, stirling_approx) + return result + + def sigmoid_cross_entropy_with_logits(logits, targets, name=None): """Computes sigmoid cross entropy given `logits`. @@ -356,9 +423,8 @@ def sigmoid_cross_entropy_with_logits(logits, targets, name=None): try: targets.get_shape().merge_with(logits.get_shape()) except ValueError: - raise ValueError( - "logits and targets must have the same shape (%s vs %s)" - % (logits.get_shape(), targets.get_shape())) + raise ValueError("logits and targets must have the same shape (%s vs %s)" + % (logits.get_shape(), targets.get_shape())) # The logistic loss formula from above is # x - x * z + log(1 + exp(-x)) @@ -377,8 +443,7 @@ def sigmoid_cross_entropy_with_logits(logits, targets, name=None): name=name) -def weighted_cross_entropy_with_logits(logits, targets, pos_weight, - name=None): +def weighted_cross_entropy_with_logits(logits, targets, pos_weight, name=None): """Computes a weighted cross entropy. 
This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`, @@ -430,9 +495,8 @@ def weighted_cross_entropy_with_logits(logits, targets, pos_weight, try: targets.get_shape().merge_with(logits.get_shape()) except ValueError: - raise ValueError( - "logits and targets must have the same shape (%s vs %s)" - % (logits.get_shape(), targets.get_shape())) + raise ValueError("logits and targets must have the same shape (%s vs %s)" + % (logits.get_shape(), targets.get_shape())) # The logistic loss formula from above is # (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x)) @@ -517,8 +581,8 @@ def zero_fraction(value, name=None): with ops.op_scope([value], name, "zero_fraction"): value = ops.convert_to_tensor(value, name="value") zero = constant_op.constant(0, dtype=value.dtype, name="zero") - return math_ops.reduce_mean(math_ops.cast(math_ops.equal(value, zero), - dtypes.float32)) + return math_ops.reduce_mean( + math_ops.cast(math_ops.equal(value, zero), dtypes.float32)) # pylint: disable=redefined-builtin,line-too-long @@ -549,7 +613,8 @@ def depthwise_conv2d(input, filter, strides, padding, name=None): strides: 1-D of size 4. The stride of the sliding window for each dimension of `input`. padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. - See the [comment here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) + See the [comment + here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) name: A name for this operation (optional). Returns: @@ -567,8 +632,8 @@ def depthwise_conv2d(input, filter, strides, padding, name=None): if input.get_shape().ndims is not None: assert len(input.get_shape()) == 4 assert input.get_shape()[3] == in_channels, ( - "Mismatched input depth %d and number of depthwise filters %d." % ( - input.get_shape()[3].value, in_channels)) + "Mismatched input depth %d and number of depthwise filters %d." 
% + (input.get_shape()[3].value, in_channels)) else: assert input.get_shape().ndims is not None, ( "Either tensor must provide static shape information.") @@ -578,8 +643,8 @@ def depthwise_conv2d(input, filter, strides, padding, name=None): if in_channels == 1: return nn_ops.conv2d(input, filter, strides, padding, name=name) else: - return nn_ops.depthwise_conv2d_native(input, filter, strides, padding, - name=name) + return nn_ops.depthwise_conv2d_native( + input, filter, strides, padding, name=name) # pylint: enable=redefined-builtin,line-too-long @@ -617,7 +682,8 @@ def separable_conv2d(input, depthwise_filter, pointwise_filter, strides, strides: 1-D of size 4. The strides for the depthwise convolution for each dimension of `input`. padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. - See the [comment here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) + See the [comment + here](https://www.tensorflow.org/api_docs/python/nn.html#convolution) name: A name for this operation (optional). Returns: @@ -630,10 +696,10 @@ def separable_conv2d(input, depthwise_filter, pointwise_filter, strides, with ops.op_scope([input, depthwise_filter, pointwise_filter], name, "separable_conv2d") as name: input = ops.convert_to_tensor(input, name="tensor_in") - depthwise_filter = ops.convert_to_tensor(depthwise_filter, - name="depthwise_filter") - pointwise_filter = ops.convert_to_tensor(pointwise_filter, - name="pointwise_filter") + depthwise_filter = ops.convert_to_tensor( + depthwise_filter, name="depthwise_filter") + pointwise_filter = ops.convert_to_tensor( + pointwise_filter, name="pointwise_filter") if pointwise_filter.get_shape().ndims is not None: assert len(pointwise_filter.get_shape()) == 4 @@ -653,10 +719,10 @@ def separable_conv2d(input, depthwise_filter, pointwise_filter, strides, # The layout of the ops in the graph are expected to be as follows: # depthwise_conv2d // Conv2D op corresponding to native deptwise conv. 
# separable_conv2d // Conv2D op corresponding to the pointwise conv. - depthwise = nn_ops.depthwise_conv2d_native(input, depthwise_filter, strides, - padding, name="depthwise") - return nn_ops.conv2d(depthwise, pointwise_filter, [1, 1, 1, 1], - padding="VALID", name=name) + depthwise = nn_ops.depthwise_conv2d_native( + input, depthwise_filter, strides, padding, name="depthwise") + return nn_ops.conv2d( + depthwise, pointwise_filter, [1, 1, 1, 1], padding="VALID", name=name) # pylint: enable=redefined-builtin,line-too-long @@ -683,29 +749,19 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None): * the (possibly shifted) sum of squares of the elements in the array. * the shift by which the mean must be corrected or None if `shift` is None. """ - with ops.op_scope([x, axes, shift], name, "sufficient_statistics"): + axes = list(set(axes)) + with ops.op_scope([x, shift], name, "sufficient_statistics"): x = ops.convert_to_tensor(x, name="x") x_shape = x.get_shape() if x_shape.is_fully_defined(): counts = 1 - m_shape = [] - for d in xrange(x_shape.ndims): - dim = x_shape[d].value - if d in set(axes): - counts *= dim - dim = 1 - m_shape.append(dim) + for d in axes: + counts *= x_shape[d].value counts = constant_op.constant(counts, dtype=x.dtype) else: # shape needs to be inferred at runtime. - x_shape = array_ops.shape(x) - select_axes = sparse_ops.sparse_to_dense(axes, array_ops.shape(x_shape), - True, False) - m_shape = math_ops.select(select_axes, array_ops.ones_like(x_shape), - x_shape) + x_dims = array_ops.gather(array_ops.shape(x), axes) counts = math_ops.cast( - math_ops.reduce_prod(x_shape / m_shape), - x.dtype, - name="count") + math_ops.reduce_prod(x_dims), x.dtype, name="count") if shift is not None: shift = ops.convert_to_tensor(shift, name="shift") m_ss = math_ops.sub(x, shift) @@ -742,10 +798,9 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None): else: # no shift. 
shifted_mean = math_ops.mul(mean_ss, divisor, name="mean") mean = shifted_mean - variance = math_ops.sub( - math_ops.mul(variance_ss, divisor), - math_ops.square(shifted_mean), - name="variance") + variance = math_ops.sub(math_ops.mul(variance_ss, divisor), + math_ops.square(shifted_mean), + name="variance") return (mean, variance) @@ -782,16 +837,13 @@ def moments(x, axes, shift=None, name=None, keep_dims=False): y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x shift = math_ops.cast(shift, dtypes.float32) if ( shift is not None and x.dtype == dtypes.float16) else shift - counts, m_ss, v_ss, shift = sufficient_statistics(y, - axes, - shift=shift, - keep_dims=keep_dims, - name=name) + counts, m_ss, v_ss, shift = sufficient_statistics( + y, axes, shift=shift, keep_dims=keep_dims, name=name) with ops.control_dependencies([counts, m_ss, v_ss]): mean, variance = normalize_moments(counts, m_ss, v_ss, shift, name=name) if x.dtype == dtypes.float16: - return (math_ops.cast(mean, dtypes.float16), math_ops.cast( - variance, dtypes.float16)) + return (math_ops.cast(mean, dtypes.float16), + math_ops.cast(variance, dtypes.float16)) else: return (mean, variance) @@ -848,8 +900,8 @@ def batch_normalization(x, inv = math_ops.rsqrt(variance + variance_epsilon) if scale is not None: inv *= scale - return x * inv + ( - offset - mean * inv if offset is not None else -mean * inv) + return x * inv + (offset - mean * inv + if offset is not None else -mean * inv) def batch_norm_with_global_normalization(t, @@ -902,8 +954,13 @@ def _sum_rows(x): return array_ops.reshape(math_ops.matmul(x, ones), [-1]) -def _compute_sampled_logits(weights, biases, inputs, labels, num_sampled, - num_classes, num_true=1, +def _compute_sampled_logits(weights, + biases, + inputs, + labels, + num_sampled, + num_classes, + num_true=1, sampled_values=None, subtract_log_q=True, remove_accidental_hits=False, @@ -955,8 +1012,8 @@ def _compute_sampled_logits(weights, biases, inputs, labels, 
num_sampled, if not isinstance(weights, list): weights = [weights] - with ops.op_scope( - weights + [biases, inputs, labels], name, "compute_sampled_logits"): + with ops.op_scope(weights + [biases, inputs, labels], name, + "compute_sampled_logits"): if labels.dtype != dtypes.int64: labels = math_ops.cast(labels, dtypes.int64) labels_flat = array_ops.reshape(labels, [-1]) @@ -1018,9 +1075,8 @@ def _compute_sampled_logits(weights, biases, inputs, labels, num_sampled, # sampled_w has shape [num_sampled, dim] # sampled_b has shape [num_sampled] # Apply X*W'+B, which yields [batch_size, num_sampled] - sampled_logits = math_ops.matmul(inputs, - sampled_w, - transpose_b=True) + sampled_b + sampled_logits = math_ops.matmul( + inputs, sampled_w, transpose_b=True) + sampled_b if remove_accidental_hits: acc_hits = candidate_sampling_ops.compute_accidental_hits( @@ -1029,10 +1085,10 @@ def _compute_sampled_logits(weights, biases, inputs, labels, num_sampled, # This is how SparseToDense expects the indices. 
acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1]) - acc_ids_2d_int32 = array_ops.reshape(math_ops.cast( - acc_ids, dtypes.int32), [-1, 1]) - sparse_indices = array_ops.concat( - 1, [acc_indices_2d, acc_ids_2d_int32], "sparse_indices") + acc_ids_2d_int32 = array_ops.reshape( + math_ops.cast(acc_ids, dtypes.int32), [-1, 1]) + sparse_indices = array_ops.concat(1, [acc_indices_2d, acc_ids_2d_int32], + "sparse_indices") # Create sampled_logits_shape = [batch_size, num_sampled] sampled_logits_shape = array_ops.concat( 0, @@ -1040,8 +1096,11 @@ def _compute_sampled_logits(weights, biases, inputs, labels, num_sampled, if sampled_logits.dtype != acc_weights.dtype: acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype) sampled_logits += sparse_ops.sparse_to_dense( - sparse_indices, sampled_logits_shape, acc_weights, - default_value=0.0, validate_indices=False) + sparse_indices, + sampled_logits_shape, + acc_weights, + default_value=0.0, + validate_indices=False) if subtract_log_q: # Subtract log of Q(l), prior probability that l appears in sampled. @@ -1053,14 +1112,19 @@ def _compute_sampled_logits(weights, biases, inputs, labels, num_sampled, # true_logits is a float tensor, ones_like(true_logits) is a float tensor # of ones. We then divide by num_true to ensure the per-example labels sum # to 1.0, i.e. form a proper probability distribution. 
- out_labels = array_ops.concat( - 1, [array_ops.ones_like(true_logits) / num_true, - array_ops.zeros_like(sampled_logits)]) + out_labels = array_ops.concat(1, + [array_ops.ones_like(true_logits) / num_true, + array_ops.zeros_like(sampled_logits)]) return out_logits, out_labels -def nce_loss(weights, biases, inputs, labels, num_sampled, num_classes, +def nce_loss(weights, + biases, + inputs, + labels, + num_sampled, + num_classes, num_true=1, sampled_values=None, remove_accidental_hits=False, @@ -1115,23 +1179,32 @@ def nce_loss(weights, biases, inputs, labels, num_sampled, num_classes, A `batch_size` 1-D tensor of per-example NCE losses. """ logits, labels = _compute_sampled_logits( - weights, biases, inputs, labels, num_sampled, num_classes, + weights, + biases, + inputs, + labels, + num_sampled, + num_classes, num_true=num_true, sampled_values=sampled_values, subtract_log_q=True, remove_accidental_hits=remove_accidental_hits, partition_strategy=partition_strategy, name=name) - sampled_losses = sigmoid_cross_entropy_with_logits(logits, - labels, - name="sampled_losses") + sampled_losses = sigmoid_cross_entropy_with_logits( + logits, labels, name="sampled_losses") # sampled_losses is batch_size x {true_loss, sampled_losses...} # We sum out true and sampled losses. 
return _sum_rows(sampled_losses) -def sampled_softmax_loss(weights, biases, inputs, labels, num_sampled, - num_classes, num_true=1, +def sampled_softmax_loss(weights, + biases, + inputs, + labels, + num_sampled, + num_classes, + num_true=1, sampled_values=None, remove_accidental_hits=True, partition_strategy="mod", @@ -1182,7 +1255,12 @@ def sampled_softmax_loss(weights, biases, inputs, labels, num_sampled, """ logits, labels = _compute_sampled_logits( - weights, biases, inputs, labels, num_sampled, num_classes, + weights, + biases, + inputs, + labels, + num_sampled, + num_classes, num_true=num_true, sampled_values=sampled_values, subtract_log_q=True, @@ -1193,7 +1271,6 @@ def sampled_softmax_loss(weights, biases, inputs, labels, num_sampled, # sampled_losses is a [batch_size] tensor. return sampled_losses - # TODO(cwhipkey): sigmoid and tanh should not be exposed from tf.nn. __all__ = make_all(__name__) __all__.append("zero_fraction") # documented in training.py diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 73e51aab7de..562c0408b94 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1125,7 +1125,7 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None): dtype=x.dtype) # 0. if [keep_prob, 1.0) and 1. 
if [1.0, 1.0 + keep_prob) binary_tensor = math_ops.floor(random_tensor) - ret = x * math_ops.inv(keep_prob) * binary_tensor + ret = math_ops.div(x, keep_prob) * binary_tensor ret.set_shape(x.get_shape()) return ret diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 06269054398..b0ab78e68ff 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -83,6 +83,46 @@ class SoftmaxTest(tf.test.TestCase): self.assertLess(err, eps) +class LogPoissonLossTest(tf.test.TestCase): + + def _log_poisson_loss(self, x, z, compute_full_loss=False): + lpl = np.exp(x) - z * x + if compute_full_loss: + stirling_approx = z * np.log(z) - z + 0.5 * np.log(2. * np.pi * z) + lpl += np.ma.masked_array(stirling_approx, mask=(z <= 1)).filled(0.) + return lpl + + def testLogPoissonLoss(self): + x_shape = [5, 10] + x_np = np.random.randn(*x_shape).astype(np.float32) + z_np = np.random.randint(0, 5, size=x_shape).astype(np.float32) + y_np = self._log_poisson_loss(x_np, z_np, compute_full_loss=False) + y_np_stirling = self._log_poisson_loss(x_np, z_np, compute_full_loss=True) + with self.test_session(): + y_tf = tf.nn.log_poisson_loss(x_np, z_np, compute_full_loss=False) + y_tf_stirling = tf.nn.log_poisson_loss(x_np, z_np, compute_full_loss=True) + y_tf_np = y_tf.eval() + y_tf_np_stirling = y_tf_stirling.eval() + eps = 1e-3 + self.assertAllClose(y_tf_np, y_np, eps) + self.assertAllClose(y_tf_np_stirling, y_np_stirling, eps) + + def testGradient(self): + x_shape = [5, 10] + x_np = np.random.randn(*x_shape).astype(np.float64) + z_np = np.random.randint(0, 5, size=x_shape).astype(np.float64) + with self.test_session(): + x_tf = tf.constant(x_np) + y_tf = tf.nn.log_poisson_loss(x_tf, z_np, compute_full_loss=False) + y_tf_stirling = tf.nn.log_poisson_loss(x_tf, z_np, compute_full_loss=True) + err = tf.test.compute_gradient_error(x_tf, x_shape, y_tf, x_shape) + err_stirling = tf.test.compute_gradient_error(x_tf, x_shape, + y_tf_stirling, 
x_shape) + eps = 1e-6 + self.assertLess(err, eps) + self.assertLess(err_stirling, eps) + + class LogSoftmaxTest(tf.test.TestCase): def _log_softmax(self, x): diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py index 94d9b454e19..4f6f8daf621 100644 --- a/tensorflow/python/ops/rnn.py +++ b/tensorflow/python/ops/rnn.py @@ -19,6 +19,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import six + from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -522,7 +524,15 @@ def bidirectional_rnn(cell_fw, cell_bw, inputs, if not inputs: raise ValueError("inputs must not be empty") - name = scope or "BiRNN" + if scope is None: + name = "BiRNN" + elif isinstance(scope, six.string_types): + name = scope + elif isinstance(scope, vs.VariableScope): + name = scope.name + else: + raise TypeError("scope must be a string or an instance of VariableScope") + # Forward direction with vs.variable_scope(name + "_FW") as fw_scope: output_fw, output_state_fw = rnn(cell_fw, inputs, initial_state_fw, dtype, @@ -635,7 +645,15 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None, if not isinstance(cell_bw, rnn_cell.RNNCell): raise TypeError("cell_bw must be an instance of RNNCell") - name = scope or "BiRNN" + if scope is None: + name = "BiRNN" + elif isinstance(scope, six.string_types): + name = scope + elif isinstance(scope, vs.VariableScope): + name = scope.name + else: + raise TypeError("scope must be a string or an instance of VariableScope") + # Forward direction with vs.variable_scope(name + "_FW") as fw_scope: output_fw, output_state_fw = dynamic_rnn( @@ -686,8 +704,9 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None, of time steps and batch size, or a (possibly nested) tuple of such tensors, matching the nested structure of `cell.output_size`. 
- The parameter `sequence_length` is required and dynamic calculation is - automatically performed. + The parameter `sequence_length` is optional and is used to copy-through state + and zero-out outputs when past a batch element's sequence length. So it's more + for correctness than performance, unlike in rnn(). Args: cell: An instance of RNNCell. diff --git a/tensorflow/python/ops/rnn_cell.py b/tensorflow/python/ops/rnn_cell.py index 33ae2ee30bf..1a6ea6fcecd 100644 --- a/tensorflow/python/ops/rnn_cell.py +++ b/tensorflow/python/ops/rnn_cell.py @@ -87,6 +87,11 @@ def _state_size_with_prefix(state_size, prefix=None): class RNNCell(object): """Abstract object representing an RNN cell. + The definition of cell in this package differs from the definition used in the + literature. In the literature, cell refers to an object with a single scalar + output. The definition in this package refers to a horizontal array of such + units. + An RNN cell, in the most abstract setting, is anything that has a state and performs some operation that takes a matrix of inputs. This operation results in an output matrix with `self.output_size` columns. diff --git a/tensorflow/python/ops/seq2seq.py b/tensorflow/python/ops/seq2seq.py index 99d8daf04c9..8605811b474 100644 --- a/tensorflow/python/ops/seq2seq.py +++ b/tensorflow/python/ops/seq2seq.py @@ -560,6 +560,13 @@ def attention_decoder(decoder_inputs, initial_state, attention_states, cell, def attention(query): """Put attention masks on hidden using hidden_features and query.""" ds = [] # Results of attention reads will be stored here. + if nest.is_sequence(query): # If the query is a tuple, flatten it. + query_list = nest.flatten(query) + for q in query_list: # Check that ndims == 2 if specified. 
+ ndims = q.get_shape().ndims + if ndims: + assert ndims == 2 + query = array_ops.concat(1, query_list) for a in xrange(num_heads): with variable_scope.variable_scope("Attention_%d" % a): y = linear(query, attention_vec_size, True) diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index 139a70fbb2b..7da35219e2a 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -775,7 +775,8 @@ def sparse_to_indicator(sp_input, vocab_size, name=None): name=name) -def sparse_merge(sp_ids, sp_values, vocab_size, name=None): +def sparse_merge(sp_ids, sp_values, vocab_size, name=None, + already_sorted=False): """Combines a batch of feature ids and values into a single `SparseTensor`. The most common use case for this function occurs when feature ids and @@ -794,14 +795,17 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None): For example, consider the following feature vectors: + ```python vector1 = [-3, 0, 0, 0, 0, 0] vector2 = [ 0, 1, 0, 4, 1, 0] vector3 = [ 5, 0, 0, 9, 0, 0] + ``` These might be stored sparsely in the following Example protos by storing only the feature ids (column number if the vectors are treated as a matrix) of the non-zero elements and the corresponding values: + ```python examples = [Example(features={ "ids": Feature(int64_list=Int64List(value=[0])), "values": Feature(float_list=FloatList(value=[-3]))}), @@ -811,6 +815,7 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None): Example(features={ "ids": Feature(int64_list=Int64List(value=[0, 3])), "values": Feature(float_list=FloatList(value=[5, 9]))})] + ``` The result of calling parse_example on these examples will produce a dictionary with entries for "ids" and "values". Passing those two objects @@ -823,9 +828,11 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None): original matrix, i.e., (3, 6). 
For our example above, the output will be equal to: + ```python SparseTensor(indices=[[0, 0], [1, 1], [1, 3], [1, 4], [2, 0], [2, 3]], values=[-3, 1, 4, 1, 5, 9], shape=[3, 6]) + ``` Args: sp_ids: A `SparseTensor` with `values` property of type `int32` @@ -834,6 +841,9 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None): vocab_size: A scalar `int64` Tensor (or Python int) containing the new size of the last dimension, `all(0 <= sp_ids.values < vocab_size)`. name: A name prefix for the returned tensors (optional) + already_sorted: A boolean to specify whether the per-batch values in + `sp_values` are already sorted. If so skip sorting, False by default + (optional). Returns: A `SparseTensor` compactly representing a batch of feature ids and values, @@ -868,7 +878,8 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None): [array_ops.slice(sp_ids.shape, [0], array_ops.expand_dims(rank - 1, 0)), math_ops.cast(array_ops.pack([vocab_size]), dtypes.int64)]) - return sparse_reorder(ops.SparseTensor(new_indices, new_values, new_shape)) + result = ops.SparseTensor(new_indices, new_values, new_shape) + return result if already_sorted else sparse_reorder(result) def sparse_retain(sp_input, to_retain): diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index 4517b6d9978..cfb8dc125ea 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -573,11 +573,19 @@ class VariableScope(object): partitioner: callable or `None`: the partitioner passed to `get_variable`. custom_getter: default custom getter passed to get_variable. name_scope: The name passed to `tf.name_scope`. + dtype: default type passed to get_variable (defaults to DT_FLOAT). 
""" - def __init__(self, reuse, name="", initializer=None, regularizer=None, - caching_device=None, partitioner=None, custom_getter=None, - name_scope=""): + def __init__(self, + reuse, + name="", + initializer=None, + regularizer=None, + caching_device=None, + partitioner=None, + custom_getter=None, + name_scope="", + dtype=dtypes.float32): """Creates a new VariableScope with the given properties.""" self._name = name self._initializer = initializer @@ -587,6 +595,7 @@ class VariableScope(object): self._partitioner = partitioner self._custom_getter = custom_getter self._name_scope = name_scope + self._dtype = dtype @property def name(self): @@ -604,6 +613,10 @@ class VariableScope(object): def initializer(self): return self._initializer + @property + def dtype(self): + return self._dtype + @property def regularizer(self): return self._regularizer @@ -628,6 +641,10 @@ class VariableScope(object): """Set initializer for this scope.""" self._initializer = initializer + def set_dtype(self, dtype): + """Set data type for this scope.""" + self._dtype = dtype + def set_regularizer(self, regularizer): """Set regularizer for this scope.""" self._regularizer = regularizer @@ -644,10 +661,18 @@ class VariableScope(object): """Set custom getter for this scope.""" self._custom_getter = custom_getter - def get_variable(self, var_store, name, shape=None, dtype=dtypes.float32, - initializer=None, regularizer=None, - trainable=True, collections=None, caching_device=None, - partitioner=None, validate_shape=True, + def get_variable(self, + var_store, + name, + shape=None, + dtype=None, + initializer=None, + regularizer=None, + trainable=True, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, custom_getter=None): """Gets an existing variable with this name or create a new one.""" if initializer is None: @@ -660,6 +685,8 @@ class VariableScope(object): partitioner = self._partitioner if custom_getter is None: custom_getter = self._custom_getter + if 
dtype is None: + dtype = self._dtype full_name = self.name + "/" + name if self.name else name # Variable names only depend on variable_scope (full_name here), @@ -672,12 +699,18 @@ class VariableScope(object): partitioner=partitioner, validate_shape=validate_shape, custom_getter=custom_getter) - def _get_partitioned_variable( - self, var_store, name, - shape=None, dtype=dtypes.float32, - initializer=None, regularizer=None, - trainable=True, collections=None, caching_device=None, - partitioner=None, validate_shape=True): + def _get_partitioned_variable(self, + var_store, + name, + shape=None, + dtype=None, + initializer=None, + regularizer=None, + trainable=True, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True): """Gets an existing variable with this name or create a new one.""" if initializer is None: initializer = self._initializer @@ -687,6 +720,9 @@ class VariableScope(object): caching_device = self._caching_device if partitioner is None: partitioner = self._partitioner + if dtype is None: + dtype = self._dtype + if self._custom_getter is not None: raise ValueError( "Private access to _get_partitioned_variable is not allowed when " @@ -743,9 +779,16 @@ def _get_default_variable_store(): return store -def get_variable(name, shape=None, dtype=dtypes.float32, initializer=None, - regularizer=None, trainable=True, collections=None, - caching_device=None, partitioner=None, validate_shape=True, +def get_variable(name, + shape=None, + dtype=None, + initializer=None, + regularizer=None, + trainable=True, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, custom_getter=None): """Gets an existing variable with these parameters or create a new one. 
@@ -830,10 +873,16 @@ def get_variable(name, shape=None, dtype=dtypes.float32, initializer=None, custom_getter=custom_getter) -def _get_partitioned_variable( - name, shape=None, dtype=dtypes.float32, initializer=None, - regularizer=None, trainable=True, collections=None, - caching_device=None, partitioner=None, validate_shape=True): +def _get_partitioned_variable(name, + shape=None, + dtype=None, + initializer=None, + regularizer=None, + trainable=True, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True): """Gets or creates a sharded variable list with these parameters. The `partitioner` must be a callable that accepts a fully defined @@ -915,10 +964,15 @@ def _get_partitioned_variable( @contextlib.contextmanager -def _pure_variable_scope(name_or_scope, reuse=None, initializer=None, - regularizer=None, caching_device=None, - partitioner=None, custom_getter=None, - old_name_scope=None): +def _pure_variable_scope(name_or_scope, + reuse=None, + initializer=None, + regularizer=None, + caching_device=None, + partitioner=None, + custom_getter=None, + old_name_scope=None, + dtype=dtypes.float32): """Creates a context for the variable_scope, see `variable_scope` for docs. Note: this does not create a name scope. @@ -933,6 +987,7 @@ def _pure_variable_scope(name_or_scope, reuse=None, initializer=None, partitioner: default partitioner for variables within this scope. custom_getter: default custom getter for variables within this scope. old_name_scope: the original name scope when re-entering a variable scope. + dtype: type of the variables within this scope (defaults to `DT_FLOAT`). Yields: A scope that can be to captured and reused. 
@@ -967,6 +1022,7 @@ def _pure_variable_scope(name_or_scope, reuse=None, initializer=None, regularizer=name_or_scope.regularizer, caching_device=name_or_scope.caching_device, partitioner=name_or_scope.partitioner, + dtype=name_or_scope.dtype, custom_getter=name_or_scope.custom_getter, name_scope=name_scope) if initializer is not None: @@ -979,6 +1035,8 @@ def _pure_variable_scope(name_or_scope, reuse=None, initializer=None, default_varscope[0].set_partitioner(partitioner) if custom_getter is not None: default_varscope[0].set_custom_getter(custom_getter) + if dtype is not None: + default_varscope[0].set_dtype(dtype) yield default_varscope[0] else: # Handler for the case when we just prolong current variable scope. @@ -986,11 +1044,13 @@ def _pure_variable_scope(name_or_scope, reuse=None, initializer=None, # reuse and initializer (except if the user provided values to set). reuse = reuse or old.reuse # Re-using is inherited by sub-scopes. default_varscope[0] = VariableScope( - reuse, name=new_name, + reuse, + name=new_name, initializer=old.initializer, regularizer=old.regularizer, caching_device=old.caching_device, partitioner=old.partitioner, + dtype=old.dtype, custom_getter=old.custom_getter, name_scope=old_name_scope or name_or_scope) if initializer is not None: @@ -1003,6 +1063,8 @@ def _pure_variable_scope(name_or_scope, reuse=None, initializer=None, default_varscope[0].set_partitioner(partitioner) if custom_getter is not None: default_varscope[0].set_custom_getter(custom_getter) + if dtype is not None: + default_varscope[0].set_dtype(dtype) yield default_varscope[0] finally: var_store.close_variable_subscopes(new_name) @@ -1024,9 +1086,14 @@ def _get_unique_variable_scope(prefix): # pylint: disable=g-doc-return-or-yield @contextlib.contextmanager -def variable_scope(name_or_scope, reuse=None, initializer=None, - regularizer=None, caching_device=None, partitioner=None, - custom_getter=None): +def variable_scope(name_or_scope, + reuse=None, + initializer=None, + 
regularizer=None, + caching_device=None, + partitioner=None, + custom_getter=None, + dtype=None): """Returns a context for variable scope. Variable scope allows to create new variables and to share already created @@ -1094,6 +1161,8 @@ def variable_scope(name_or_scope, reuse=None, initializer=None, caching_device: default caching device for variables within this scope. partitioner: default partitioner for variables within this scope. custom_getter: default custom getter for variables within this scope. + dtype: type of variables created in this scope (defaults to the type + in the passed scope, or inherited from parent scope). Returns: A scope that can be to captured and reused. @@ -1117,25 +1186,42 @@ def variable_scope(name_or_scope, reuse=None, initializer=None, else: old_name_scope = name_or_scope.original_name_scope with _pure_variable_scope( - name_or_scope, reuse=reuse, initializer=initializer, - regularizer=regularizer, caching_device=caching_device, - partitioner=partitioner, custom_getter=custom_getter, - old_name_scope=old_name_scope) as vs: + name_or_scope, + reuse=reuse, + initializer=initializer, + regularizer=regularizer, + caching_device=caching_device, + partitioner=partitioner, + custom_getter=custom_getter, + old_name_scope=old_name_scope, + dtype=dtype) as vs: yield vs else: # This can only happen if someone is entering the root variable scope. 
with _pure_variable_scope( - name_or_scope, reuse=reuse, initializer=initializer, - regularizer=regularizer, caching_device=caching_device, - partitioner=partitioner, custom_getter=custom_getter) as vs: + name_or_scope, + reuse=reuse, + initializer=initializer, + regularizer=regularizer, + caching_device=caching_device, + partitioner=partitioner, + custom_getter=custom_getter, + dtype=dtype) as vs: yield vs # pylint: disable=g-doc-return-or-yield @contextlib.contextmanager -def variable_op_scope(values, name_or_scope, default_name=None, - initializer=None, regularizer=None, caching_device=None, - partitioner=None, custom_getter=None, reuse=None): +def variable_op_scope(values, + name_or_scope, + default_name=None, + initializer=None, + regularizer=None, + caching_device=None, + partitioner=None, + custom_getter=None, + reuse=None, + dtype=None): """Returns a context manager for defining an op that creates variables. This context manager validates that the given `values` are from the @@ -1176,6 +1262,8 @@ def variable_op_scope(values, name_or_scope, default_name=None, custom_getter: The default custom getter for variables within this scope. reuse: `True` or `None`; if `True`, we go into reuse mode for this scope as well as all sub-scopes; if `None`, we just inherit the parent scope reuse. + dtype: The default type of variables created in this scope, defaults to the + type of the parent scope. Returns: A context manager for use in defining a Python op. 
@@ -1191,9 +1279,14 @@ def variable_op_scope(values, name_or_scope, default_name=None, with g.as_default(): if name_or_scope: with variable_scope( - name_or_scope, reuse=reuse, initializer=initializer, - regularizer=regularizer, caching_device=caching_device, - partitioner=partitioner, custom_getter=custom_getter) as vs: + name_or_scope, + reuse=reuse, + initializer=initializer, + regularizer=regularizer, + caching_device=caching_device, + partitioner=partitioner, + custom_getter=custom_getter, + dtype=dtype) as vs: yield vs else: if reuse: @@ -1201,10 +1294,14 @@ def variable_op_scope(values, name_or_scope, default_name=None, with ops.name_scope(default_name) as scope: unique_default_name = _get_unique_variable_scope(default_name) with _pure_variable_scope( - unique_default_name, initializer=initializer, - regularizer=regularizer, caching_device=caching_device, - partitioner=partitioner, custom_getter=custom_getter, - old_name_scope=scope) as vs: + unique_default_name, + initializer=initializer, + regularizer=regularizer, + caching_device=caching_device, + partitioner=partitioner, + custom_getter=custom_getter, + old_name_scope=scope, + dtype=dtype) as vs: yield vs diff --git a/tensorflow/python/platform/base.i b/tensorflow/python/platform/base.i index 99aae3b2416..df40491ed30 100644 --- a/tensorflow/python/platform/base.i +++ b/tensorflow/python/platform/base.i @@ -146,6 +146,7 @@ std::vector* OUTPUT (std::vector temp), %enddef _LIST_OUTPUT_TYPEMAP(string, _SwigBytes_FromString); +_LIST_OUTPUT_TYPEMAP(long long, PyLong_FromLongLong); _LIST_OUTPUT_TYPEMAP(unsigned long long, PyLong_FromUnsignedLongLong); %typemap(in) uint64 { @@ -178,6 +179,7 @@ _LIST_OUTPUT_TYPEMAP(unsigned long long, PyLong_FromUnsignedLongLong); %enddef _COPY_TYPEMAPS(unsigned long long, uint64); +_COPY_TYPEMAPS(long long, int64); // SWIG macros for explicit API declaration. 
// Usage: diff --git a/tensorflow/python/platform/benchmark.py b/tensorflow/python/platform/benchmark.py index 1ba89db5628..23c03c38b13 100644 --- a/tensorflow/python/platform/benchmark.py +++ b/tensorflow/python/platform/benchmark.py @@ -164,6 +164,7 @@ class Benchmark(six.with_metaclass(_BenchmarkRegistrar, object)): wall_time: (optional) Total wall time in seconds throughput: (optional) Throughput (in MB/s) extras: (optional) Dict mapping string keys to additional benchmark info. + Values may be either floats or values that are convertible to strings. name: (optional) Override the BenchmarkEntry name with `name`. Otherwise it is inferred from the top-level method name. """ @@ -189,7 +190,8 @@ class TensorFlowBenchmark(Benchmark): burn_iters=2, min_iters=10, store_trace=False, - name=None): + name=None, + extras=None): """Run an op or tensor in the given session. Report the results. Args: @@ -205,6 +207,8 @@ class TensorFlowBenchmark(Benchmark): in the extras field "full_trace_chrome_format". name: (optional) Override the BenchmarkEntry name with `name`. Otherwise it is inferred from the top-level method name. + extras: (optional) Dict mapping string keys to additional benchmark info. + Values may be either floats or values that are convertible to strings. """ for _ in range(burn_iters): sess.run(op_or_tensor, feed_dict=feed_dict) @@ -218,7 +222,7 @@ class TensorFlowBenchmark(Benchmark): delta = end_time - start_time deltas[i] = delta - extras = {} + extras = extras if extras is not None else {} if store_trace: run_options = config_pb2.RunOptions( trace_level=config_pb2.RunOptions.FULL_TRACE) diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i index ffd211152b9..ef82a009f92 100644 --- a/tensorflow/python/tensorflow.i +++ b/tensorflow/python/tensorflow.i @@ -28,6 +28,7 @@ limitations under the License. 
%include "tensorflow/python/client/tf_session.i" %include "tensorflow/python/client/device_lib.i" +%include "tensorflow/python/client/net_lib.i" %include "tensorflow/python/client/quantize_training.i" %include "tensorflow/python/lib/io/file_io.i" diff --git a/tensorflow/python/training/coordinator.py b/tensorflow/python/training/coordinator.py index a7fc169d18a..ea583006cc8 100644 --- a/tensorflow/python/training/coordinator.py +++ b/tensorflow/python/training/coordinator.py @@ -150,6 +150,11 @@ class Coordinator(object): self._exc_info_to_raise = None # True if we have called join() already. self._joined = False + # Set of threads registered for joining when join() is called. These + # threads will be joined in addition to the threads passed to the join() + # call. It's ok if threads are both registered and passed to the join() + # call. + self._registered_threads = set() def _filter_exception(self, ex): """Check if the exception indicated in 'ex' should be ignored. @@ -305,10 +310,22 @@ class Coordinator(object): """ return self._stop_event.wait(timeout) - def join(self, threads, stop_grace_period_secs=120): + def register_thread(self, thread): + """Register a thread to join. + + Args: + thread: A Python thread to join. + """ + with self._lock: + self._registered_threads.add(thread) + + def join(self, threads=None, stop_grace_period_secs=120): """Wait for threads to terminate. - Blocks until all `threads` have terminated or `request_stop()` is called. + This call blocks until a set of threads have terminated. The set of thread + is the union of the threads passed in the `threads` argument and the list + of threads that registered with the coordinator by calling + `Coordinator.register_thread()`. After the threads stop, if an `exc_info` was passed to `request_stop`, that exception is re-raised. @@ -320,7 +337,8 @@ class Coordinator(object): that `RuntimeError`. Args: - threads: List of `threading.Threads`. The started threads to join. 
+ threads: List of `threading.Threads`. The started threads to join in + addition to the registered threads. stop_grace_period_secs: Number of seconds given to threads to stop after `request_stop()` has been called. @@ -328,6 +346,13 @@ class Coordinator(object): RuntimeError: If any thread is still alive after `request_stop()` is called and the grace period expires. """ + # Threads registered after this call will not be joined. + with self._lock: + if threads is None: + threads = self._registered_threads + else: + threads = self._registered_threads.union(set(threads)) + # Wait for all threads to stop or for request_stop() to be called. while any(t.is_alive() for t in threads) and not self.wait_for_stop(1.0): pass @@ -353,6 +378,7 @@ class Coordinator(object): # Terminate with an exception if appropriate. with self._lock: self._joined = True + self._registered_threads = set() if self._exc_info_to_raise: six.reraise(*self._exc_info_to_raise) elif stragglers: @@ -411,6 +437,7 @@ class LooperThread(threading.Thread): elif args or kwargs: raise ValueError("'args' and 'kwargs' argument require that you also " "pass 'target'") + self._coord.register_thread(self) @staticmethod def loop(coord, timer_interval_secs, target, args=None, kwargs=None): diff --git a/tensorflow/python/training/coordinator_test.py b/tensorflow/python/training/coordinator_test.py index 764307fd7d2..d67fb459d83 100644 --- a/tensorflow/python/training/coordinator_test.py +++ b/tensorflow/python/training/coordinator_test.py @@ -47,7 +47,9 @@ def RaiseInNUsingContextHandler(coord, n_secs, ex): raise ex -def SleepABit(n_secs): +def SleepABit(n_secs, coord=None): + if coord: + coord.register_thread(threading.current_thread()) time.sleep(n_secs) @@ -80,6 +82,33 @@ class CoordinatorTest(tf.test.TestCase): for t in threads: t.start() coord.join(threads) + for t in threads: + self.assertFalse(t.is_alive()) + + def testJoinAllRegistered(self): + coord = tf.train.Coordinator() + threads = [ + 
threading.Thread(target=SleepABit, args=(0.01, coord)), + threading.Thread(target=SleepABit, args=(0.02, coord)), + threading.Thread(target=SleepABit, args=(0.01, coord))] + for t in threads: + t.start() + coord.join() + for t in threads: + self.assertFalse(t.is_alive()) + + def testJoinSomeRegistered(self): + coord = tf.train.Coordinator() + threads = [ + threading.Thread(target=SleepABit, args=(0.01, coord)), + threading.Thread(target=SleepABit, args=(0.02,)), + threading.Thread(target=SleepABit, args=(0.01, coord))] + for t in threads: + t.start() + # threads[1] is not registered so we must pass it in. + coord.join(threads[1:2]) + for t in threads: + self.assertFalse(t.is_alive()) def testJoinGraceExpires(self): def TestWithGracePeriod(stop_grace_period): diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py index f24f1f4a087..ef369e90953 100644 --- a/tensorflow/python/training/learning_rate_decay.py +++ b/tensorflow/python/training/learning_rate_decay.py @@ -54,7 +54,7 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate, 100000, 0.96, staircase=True) # Passing global_step to minimize() will increment it at each step. learning_step = ( - tf.GradientDescentOptimizer(learning_rate) + tf.train.GradientDescentOptimizer(learning_rate) .minimize(...my loss..., global_step=global_step) ) ``` @@ -195,7 +195,7 @@ def polynomial_decay(learning_rate, global_step, decay_steps, power=0.5) # Passing global_step to minimize() will increment it at each step. learning_step = ( - tf.GradientDescentOptimizer(learning_rate) + tf.train.GradientDescentOptimizer(learning_rate) .minimize(...my loss..., global_step=global_step) ) ``` @@ -268,7 +268,7 @@ def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate, # Passing global_step to minimize() will increment it at each step.
learning_step = ( - tf.GradientDescentOptimizer(learning_rate) + tf.train.GradientDescentOptimizer(learning_rate) .minimize(...my loss..., global_step=global_step) ) ``` @@ -327,7 +327,7 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate, # Passing global_step to minimize() will increment it at each step. learning_step = ( - tf.GradientDescentOptimizer(learning_rate) + tf.train.GradientDescentOptimizer(learning_rate) .minimize(...my loss..., global_step=global_step) ) ``` diff --git a/tensorflow/python/training/localhost_cluster_performance_test.py b/tensorflow/python/training/localhost_cluster_performance_test.py new file mode 100644 index 00000000000..a679cd36a25 --- /dev/null +++ b/tensorflow/python/training/localhost_cluster_performance_test.py @@ -0,0 +1,133 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests and benchmarks for creating RPC clusters on localhost.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import numpy as np +import tensorflow as tf + +from tensorflow.python.util import net_lib + + +def create_local_cluster(num_workers, num_ps, protocol="grpc"): + """Create local GRPC servers and return their servers.""" + worker_ports = [net_lib.pick_unused_port_or_die() for _ in range(num_workers)] + ps_ports = [net_lib.pick_unused_port_or_die() for _ in range(num_ps)] + cluster_dict = { + "worker": ["localhost:%s" % port for port in worker_ports], + "ps": ["localhost:%s" % port for port in ps_ports]} + cs = tf.train.ClusterSpec(cluster_dict) + + workers = [ + tf.train.Server( + cs, job_name="worker", protocol=protocol, task_index=ix, start=True) + for ix in range(num_workers)] + ps_servers = [ + tf.train.Server( + cs, job_name="ps", protocol=protocol, task_index=ix, start=True) + for ix in range(num_ps)] + + return workers, ps_servers + + +class CreateLocalClusterTest(tf.test.TestCase): + + def testCreateLocalCluster(self): + workers, _ = create_local_cluster(num_workers=2, num_ps=2) + worker_sessions = [tf.Session(w.target) for w in workers] + with tf.device("/job:ps/task:0"): + var0 = tf.Variable(0.0) + with tf.device("/job:ps/task:1"): + var1 = tf.Variable(1.0) + worker_sessions[0].run([var0.initializer, var1.initializer]) + with tf.device("/job:ps/task:0"): + var2 = tf.Variable(2.0) + with tf.device("/job:ps/task:1"): + var3 = tf.Variable(3.0) + worker_sessions[1].run([var2.initializer, var3.initializer]) + + # Read values back in the opposite session + self.assertAllEqual(0.0, var0.eval(session=worker_sessions[1])) + self.assertAllEqual(1.0, var1.eval(session=worker_sessions[1])) + self.assertAllEqual(2.0, var2.eval(session=worker_sessions[0])) + self.assertAllEqual(3.0, 
var3.eval(session=worker_sessions[0])) + + +class CreateLocalClusterBenchmark(tf.test.Benchmark): + + def benchmarkCreateLocalCluster(self): + deltas = [] + iters = 50 + for _ in range(iters): + start_time = time.time() + create_local_cluster(num_workers=1, num_ps=10) + end_time = time.time() + deltas.append(end_time - start_time) + + median_deltas = np.median(deltas) + print( + "\n\nbenchmark_create_local_cluster_1_worker_10_ps. " + "iterations: %d, median wall time: %g\n\n" % (iters, median_deltas)) + self.report_benchmark( + iters=iters, + wall_time=median_deltas, + name="benchmark_create_local_cluster_1_worker_10_ps") + + +class PartitionedVariablesBenchmark(tf.test.Benchmark): + + def benchmark_create_1000_partitions_with_100_parameter_servers(self): + workers, _ = create_local_cluster(num_workers=1, num_ps=100) + worker_sessions = [tf.Session(w.target) for w in workers] + worker = worker_sessions[0] + partition_sizes = (1, 512, 1024*32, 1024*128) + + partitioned = [] + + for partition_size in partition_sizes: + # max_shard_bytes is 4, shape is 1000*partition_size float32s which should + # partition into 1000 shards, each containing partition_size float32s. 
+ print("Building partitioned variable with %d floats per partition" + % partition_size) + with tf.device(tf.train.replica_device_setter(ps_tasks=100)): + partitioned_ix = tf.get_variable( + "partitioned_%d" % partition_size, + shape=[1000 * partition_size], + dtype=tf.float32, + # Each partition to have exactly N float32s + partitioner=tf.variable_axis_size_partitioner( + max_shard_bytes=4 * partition_size)) + # Concatenates along axis 0 + partitioned.append(tf.convert_to_tensor(partitioned_ix)) + + tf.initialize_all_variables().run(session=worker) + + for ix, partition_size in enumerate(partition_sizes): + print("Running benchmark having partitions with %d floats" + % partition_size) + self.run_op_benchmark( + worker, + partitioned[ix], + name=("read_concat_1000_partitions_from_" + "100_parameter_servers_partsize_%d_floats" % partition_size)) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/python/training/queue_runner.py b/tensorflow/python/training/queue_runner.py index d31aca36f60..db3ee9d5280 100644 --- a/tensorflow/python/training/queue_runner.py +++ b/tensorflow/python/training/queue_runner.py @@ -176,6 +176,8 @@ class QueueRunner(object): coord: Optional Coordinator object for reporting errors and checking for stop conditions. """ + if coord: + coord.register_thread(threading.current_thread()) decremented = False try: while True: @@ -218,6 +220,7 @@ class QueueRunner(object): cancel_op: The Operation to run. coord: Coordinator. 
""" + coord.register_thread(threading.current_thread()) coord.wait_for_stop() try: sess.run(cancel_op) diff --git a/tensorflow/python/training/queue_runner_test.py b/tensorflow/python/training/queue_runner_test.py index a5bc6bb4adb..6487e32892d 100644 --- a/tensorflow/python/training/queue_runner_test.py +++ b/tensorflow/python/training/queue_runner_test.py @@ -122,7 +122,7 @@ class QueueRunnerTest(tf.test.TestCase): threads = qr.create_threads(sess, coord) for t in threads: t.start() - coord.join(threads) + coord.join() self.assertEqual(0, len(qr.exceptions_raised)) # The variable should be 0. self.assertEqual(0, var.eval()) @@ -137,7 +137,7 @@ class QueueRunnerTest(tf.test.TestCase): t.start() # The exception should be re-raised when joining. with self.assertRaisesRegexp(ValueError, "Operation not in the graph"): - coord.join(threads) + coord.join() def testGracePeriod(self): with self.test_session() as sess: @@ -147,14 +147,14 @@ class QueueRunnerTest(tf.test.TestCase): dequeue = queue.dequeue() qr = tf.train.QueueRunner(queue, [enqueue]) coord = tf.train.Coordinator() - threads = qr.create_threads(sess, coord, start=True) + qr.create_threads(sess, coord, start=True) # Dequeue one element and then request stop. dequeue.op.run() time.sleep(0.02) coord.request_stop() # We should be able to join because the RequestStop() will cause # the queue to be closed and the enqueue to terminate. 
- coord.join(threads, stop_grace_period_secs=0.05) + coord.join(stop_grace_period_secs=0.05) def testIgnoreMultiStarts(self): with self.test_session() as sess: @@ -171,7 +171,7 @@ class QueueRunnerTest(tf.test.TestCase): new_threads = qr.create_threads(sess, coord=coord) self.assertEqual([], new_threads) coord.request_stop() - coord.join(threads, stop_grace_period_secs=0.5) + coord.join(stop_grace_period_secs=0.5) def testThreads(self): with self.test_session() as sess: diff --git a/tensorflow/python/training/saver_large_variable_test.py b/tensorflow/python/training/saver_large_variable_test.py new file mode 100644 index 00000000000..40f0a47e430 --- /dev/null +++ b/tensorflow/python/training/saver_large_variable_test.py @@ -0,0 +1,49 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Tests for tensorflow.python.training.saver.py.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import tensorflow as tf + + +class SaverLargeVariableTest(tf.test.TestCase): + + # NOTE: This is in a separate file from saver_test.py because the + # large allocations do not play well with TSAN, and cause flaky + # failures. 
+ def testLargeVariable(self): + save_path = os.path.join(self.get_temp_dir(), "large_variable") + with tf.Session("", graph=tf.Graph()) as sess: + # Declare a variable that is exactly 2GB. This should fail, + # because a serialized checkpoint includes other header + # metadata. + with tf.device("/cpu:0"): + var = tf.Variable( + tf.constant(False, shape=[2, 1024, 1024, 1024], dtype=tf.bool)) + save = tf.train.Saver({var.op.name: var}) + var.initializer.run() + with self.assertRaisesRegexp( + tf.errors.InvalidArgumentError, + "Tensor slice is too large to serialize"): + save.save(sess, save_path) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py index 629ba4bbc14..68904bb89e7 100644 --- a/tensorflow/python/training/saver_test.py +++ b/tensorflow/python/training/saver_test.py @@ -287,22 +287,6 @@ class SaverTest(tf.test.TestCase): expected_save_path = "%s-%d" % (save_path, global_step_int) self.assertEqual(expected_save_path, val) - def testLargeVariable(self): - save_path = os.path.join(self.get_temp_dir(), "large_variable") - with tf.Session("", graph=tf.Graph()) as sess: - # Declare a variable that is exactly 2GB. This should fail, - # because a serialized checkpoint includes other header - # metadata. 
- with tf.device("/cpu:0"): - var = tf.Variable( - tf.constant(False, shape=[2, 1024, 1024, 1024], dtype=tf.bool)) - save = tf.train.Saver({var.op.name: var}) - var.initializer.run() - with self.assertRaisesRegexp( - tf.errors.InvalidArgumentError, - "Tensor slice is too large to serialize"): - save.save(sess, save_path) - class SaveRestoreShardedTest(tf.test.TestCase): @@ -611,7 +595,7 @@ class MaxToKeepTest(tf.test.TestCase): self.assertEqual([], save2.last_checkpoints) self.assertTrue(gfile.Exists(s2)) - def testNoMetaGrap(self): + def testNoMetaGraph(self): save_dir = _TestDir("no_meta_graph") with self.test_session() as sess: diff --git a/tensorflow/python/training/server_lib.i b/tensorflow/python/training/server_lib.i index 6f5f3d4fa56..94250304f85 100644 --- a/tensorflow/python/training/server_lib.i +++ b/tensorflow/python/training/server_lib.i @@ -58,9 +58,9 @@ limitations under the License. } %{ +#include "tensorflow/c/tf_status_helper.h" #include "tensorflow/core/distributed_runtime/server_lib.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/util/tf_status_helper.h" using tensorflow::ServerDef; diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py index 35505b82870..a3ee383758b 100644 --- a/tensorflow/python/training/supervisor.py +++ b/tensorflow/python/training/supervisor.py @@ -296,7 +296,6 @@ class Supervisor(object): self._graph = graph self._is_chief = is_chief self._coord = coordinator.Coordinator() - self._started_threads = [] self._recovery_wait_secs = recovery_wait_secs self._stop_grace_secs = stop_grace_secs self._init_fn = init_fn @@ -636,8 +635,6 @@ class Supervisor(object): threads.append(SVTimerCheckpointThread(self, sess)) for t in threads: t.start() - self._started_threads.extend(threads) - return threads def prepare_or_wait_for_session(self, master="", config=None, @@ -712,7 +709,6 @@ class Supervisor(object): for qr in queue_runners: 
threads.extend(qr.create_threads(sess, coord=self._coord, daemon=True, start=True)) - self._started_threads.extend(threads) return threads def loop(self, timer_interval_secs, target, args=None, kwargs=None): @@ -737,7 +733,6 @@ class Supervisor(object): looper = coordinator.LooperThread(self._coord, timer_interval_secs, target=target, args=args, kwargs=kwargs) looper.start() - self._started_threads.append(looper) return looper def stop(self, threads=None, close_summary_writer=True): @@ -755,16 +750,12 @@ class Supervisor(object): `True` if the summary writer was created by the supervisor, `False` otherwise. """ - join_threads = [] - join_threads.extend(self._started_threads) - if threads is not None: - join_threads.extend(threads) self._coord.request_stop() try: # coord.join() re-raises the first reported exception; the "finally" # block ensures that we clean up whether or not an exception was # reported. - self._coord.join(join_threads, + self._coord.join(threads, stop_grace_period_secs=self._stop_grace_secs) finally: # Close the writer last, in case one of the running threads was using it. @@ -775,8 +766,6 @@ class Supervisor(object): self._summary_writer.close() self._graph_added_to_summary = False - self._started_threads = [] - def request_stop(self, ex=None): """Request that the coordinator stop the threads. diff --git a/tensorflow/python/util/net_lib.py b/tensorflow/python/util/net_lib.py new file mode 100644 index 00000000000..98a3149fdba --- /dev/null +++ b/tensorflow/python/util/net_lib.py @@ -0,0 +1,28 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A Python interface for creating TensorFlow tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import six # pylint: disable=unused-import + +from tensorflow.python import pywrap_tensorflow + + +def pick_unused_port_or_die(): + """Find an unused port on localhost.""" + return pywrap_tensorflow.PickUnusedPortOrDie() diff --git a/tensorflow/python/util/net_lib_test.py b/tensorflow/python/util/net_lib_test.py new file mode 100644 index 00000000000..1e2ad53cdae --- /dev/null +++ b/tensorflow/python/util/net_lib_test.py @@ -0,0 +1,39 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Tests for the SWIG-wrapped test lib.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from tensorflow.python.util import net_lib + + +class TestLibTest(tf.test.TestCase): + + def testPickUnusedPortOrDie(self): + port0 = net_lib.pick_unused_port_or_die() + port1 = net_lib.pick_unused_port_or_die() + self.assertGreater(port0, 0) + self.assertLess(port0, 65536) + self.assertGreater(port1, 0) + self.assertLess(port1, 65536) + self.assertNotEqual(port0, port1) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/python/util/py_checkpoint_reader.i b/tensorflow/python/util/py_checkpoint_reader.i index 39e14924dc3..acec8d03ab1 100644 --- a/tensorflow/python/util/py_checkpoint_reader.i +++ b/tensorflow/python/util/py_checkpoint_reader.i @@ -17,8 +17,8 @@ limitations under the License. %include "tensorflow/python/platform/base.i" %{ +#include "tensorflow/c/checkpoint_reader.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/util/checkpoint_reader.h" #include "tensorflow/python/lib/core/py_func.h" %} @@ -126,5 +126,5 @@ def NewCheckpointReader(filepattern): return CheckpointReader(compat.as_bytes(filepattern), status) %} -%include "tensorflow/core/util/checkpoint_reader.h" +%include "tensorflow/c/checkpoint_reader.h" %unignoreall diff --git a/tensorflow/tensorboard/README.md b/tensorflow/tensorboard/README.md index a53a80eb478..48d5ce4ccbc 100644 --- a/tensorflow/tensorboard/README.md +++ b/tensorflow/tensorboard/README.md @@ -54,18 +54,18 @@ work, but there may be bugs or performance issues. The first step in using TensorBoard is acquiring data from your TensorFlow run. For this, you need [summary -ops](https://www.tensorflow.org/versions/r0.9/api_docs/python/train.html#summary-operations). 
+ops](https://www.tensorflow.org/versions/r0.10/api_docs/python/train.html#summary-operations). Summary ops are ops, like -[`tf.matmul`](https://www.tensorflow.org/versions/r0.9/api_docs/python/math_ops.html#matmul) +[`tf.matmul`](https://www.tensorflow.org/versions/r0.10/api_docs/python/math_ops.html#matmul) or -[`tf.nn.relu`](https://www.tensorflow.org/versions/r0.9/api_docs/python/nn.html#relu), +[`tf.nn.relu`](https://www.tensorflow.org/versions/r0.10/api_docs/python/nn.html#relu), which means they take in tensors, produce tensors, and are evaluated from within a TensorFlow graph. However, summary ops have a twist: the Tensors they produce contain serialized protobufs, which are written to disk and sent to TensorBoard. To visualize the summary data in TensorBoard, you should evaluate the summary op, retrieve the result, and then write that result to disk using a SummaryWriter. A full explanation, with examples, is in [the -tutorial](https://www.tensorflow.org/versions/r0.9/how_tos/summaries_and_tensorboard/index.html). +tutorial](https://www.tensorflow.org/versions/r0.10/how_tos/summaries_and_tensorboard/index.html). ### Tags: Giving names to data @@ -105,7 +105,8 @@ For example, here is a well-organized TensorBoard log directory, with two runs, "run1" and "run2". ``` -/some/path/mnist_experiments/ some/path/mnist_experiments/run1/ +/some/path/mnist_experiments/ +/some/path/mnist_experiments/run1/ /some/path/mnist_experiments/run1/events.out.tfevents.1456525581.name /some/path/mnist_experiments/run1/events.out.tfevents.1456525585.name /some/path/mnist_experiments/run2/ @@ -113,6 +114,14 @@ For example, here is a well-organized TensorBoard log directory, with two runs, /tensorboard --logdir=/some/path/mnist_experiments ``` +You may also pass a comma separated list of log directories, and TensorBoard +will watch each directory. 
You can also assign names to individual log +directories by putting a colon between the name and the path, as in + +``` +tensorboard --logdir=name1:/path/to/logs/1,name2:/path/to/logs/2 +``` + # The Visualizations ### Events Dashboard @@ -178,7 +187,7 @@ TensorFlow model. To get best use of the graph visualizer, you should use name scopes to hierarchically group the ops in your graph - otherwise, the graph may be difficult to decipher. For more information, including examples, see [the graph visualizer -tutorial](https://www.tensorflow.org/versions/r0.9/how_tos/graph_viz/index.html#tensorboard-graph-visualization). +tutorial](https://www.tensorflow.org/versions/r0.10/how_tos/graph_viz/index.html#tensorboard-graph-visualization). # Frequently Asked Questions diff --git a/tensorflow/tensorboard/components/tf-audio-dashboard/tf-audio-dashboard.html b/tensorflow/tensorboard/components/tf-audio-dashboard/tf-audio-dashboard.html index 3c803247ac8..6b7ccb0f27c 100644 --- a/tensorflow/tensorboard/components/tf-audio-dashboard/tf-audio-dashboard.html +++ b/tensorflow/tensorboard/components/tf-audio-dashboard/tf-audio-dashboard.html @@ -25,8 +25,6 @@ tf-audio-dashboard displays a dashboard that loads audio from a TensorFlow run. - + - - - - - - - - diff --git a/tensorflow/tensorboard/components/tf-histogram-dashboard/test/rebinTests.ts b/tensorflow/tensorboard/components/tf-histogram-dashboard/test/rebinTests.ts deleted file mode 100644 index 661ba75b54f..00000000000 --- a/tensorflow/tensorboard/components/tf-histogram-dashboard/test/rebinTests.ts +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the 'License'); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an 'AS IS' BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -module TF.Histogram { - let assert = chai.assert; - - describe('Rebin', function() { - - var assertHistogramEquality = function(h1, h2) { - h1.forEach(function(b1, i) { - var b2 = h2[i]; - assert.closeTo(b1.x, b2.x, 1e-10); - assert.closeTo(b1.dx, b2.dx, 1e-10); - assert.closeTo(b1.y, b2.y, 1e-10); - }); - }; - - // - // Rebinning - // - - it('Returns an empty array if you don\'t have any bins', - function() { assert.deepEqual(rebinHistogram([], 10), []); }); - - it('Collapses two bins into one.', function() { - var histogram = [ - {x: 0, dx: 1, y: 1}, - {x: 1, dx: 1, y: 2} - ]; - var oneBin = [ - {x: 0, dx: 2, y: 3} - ]; - assertHistogramEquality(rebinHistogram(histogram, 1), oneBin); - }); - - it('Splits one bin into two.', function() { - var histogram = [ - {x: 0, dx: 1, y: 3} - ]; - var twoBin = [ - {x: 0, dx: 0.5, y: 1.5}, - {x: 0.5, dx: 0.5, y: 1.5} - ]; - assertHistogramEquality(rebinHistogram(histogram, 2), twoBin); - }); - - it('Regularizes non-uniform bins.', function() { - var histogram = [ - {x: 0, dx: 2, y: 3}, - {x: 2, dx: 3, y: 3}, - {x: 5, dx: 1, y: 1} - ]; - var twoBin = [ - {x: 0, dx: 3, y: 4}, - {x: 3, dx: 3, y: 3} - ]; - assertHistogramEquality(rebinHistogram(histogram, 2), twoBin); - }); - - }); -} diff --git a/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-dashboard.html b/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-dashboard.html index f65f31cc333..a79745b87e4 100644 --- 
a/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-dashboard.html +++ b/tensorflow/tensorboard/components/tf-image-dashboard/tf-image-dashboard.html @@ -25,8 +25,6 @@ tf-image-dashboard displays a dashboard that loads images from a TensorFlow run.